{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 《自然语言处理入门》随书代码"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "欢迎阅读《自然语言处理入门》随书代码，在开始之前，请运行下面的脚本来自动配置环境："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "环境配置完毕，请开始你的NLP之旅！\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "folders = os.path.abspath(__file__ if '__file__' in globals() else '.').split(os.path.sep)\n",
    "if 'tests' not in folders:\n",
    "    if os.name == 'nt':\n",
    "        print('找不到tests文件夹，请下载https://github.com/hankcs/pyhanlp/archive/master.zip解压后进入pyhanlp-master目录运行')\n",
    "        exit(1)\n",
    "    else:\n",
    "        !wget https://github.com/hankcs/pyhanlp/archive/master.zip && unzip -q master.zip && pip install -e pyhanlp-master\n",
    "        os.chdir('pyhanlp-master/tests')\n",
    "        folders = os.path.abspath('.').split('/')\n",
    "    \n",
    "index = folders.index('tests')\n",
    "# 将tests放入path中，并且排除IPython/extensions下面的tests\n",
    "sys.path = [os.path.sep.join(folders[:index])] + [x for x in sys.path if 'IPython' not in x]\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "# 我们作图要显示中文，但有的机器上没有中文字体，所以这里安装一下\n",
    "import matplotlib\n",
    "from matplotlib import pyplot as plt\n",
    "fontpath = ensure_data('SimHei.ttf', 'https://github.com/StellarCN/scp_zh/raw/master/fonts/SimHei.ttf')\n",
    "myfont = matplotlib.font_manager.FontProperties(fname=fontpath)\n",
    "plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号\n",
    "print('环境配置完毕，请开始你的NLP之旅！')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## 第01章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch01/hello_word.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[王国维/nr, 和/cc, 服务员/nnt]\n"
     ]
    }
   ],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-03-21 21:22\n",
    "# 《自然语言处理入门》1.6 开源工具\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "def main():\n",
    "    HanLP.Config.enableDebug()\n",
    "    #  为了避免你等得无聊，开启调试模式说点什么:-)\n",
    "    print(HanLP.segment(\"王国维和服务员\"))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## 第02章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/evaluate_cws.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 2), (2, 3), (3, 5)]\n",
      "P:91.80 R:95.69 F1:93.71 OOV-R:2.58 IV-R:98.22\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-02 22:53\n",
    "# 《自然语言处理入门》2.9 准确率评测\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import re\n",
    "from pyhanlp import *\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "\n",
    "def to_region(segmentation: str) -> list:\n",
    "    \"\"\"\n",
    "    将分词结果转换为区间\n",
    "    :param segmentation: 商品 和 服务\n",
    "    :return: [(0, 2), (2, 3), (3, 5)]\n",
    "    \"\"\"\n",
    "    region = []\n",
    "    start = 0\n",
    "    for word in re.compile(\"\\\\s+\").split(segmentation.strip()):\n",
    "        end = start + len(word)\n",
    "        region.append((start, end))\n",
    "        start = end\n",
    "    return region\n",
    "\n",
    "\n",
    "def prf(gold: str, pred: str, dic) -> tuple:\n",
    "    \"\"\"\n",
    "    计算P、R、F1\n",
    "    :param gold: 标准答案文件，比如“商品 和 服务”\n",
    "    :param pred: 分词结果文件，比如“商品 和服 务”\n",
    "    :param dic: 词典\n",
    "    :return: (P, R, F1, OOV_R, IV_R)\n",
    "    \"\"\"\n",
    "    A_size, B_size, A_cap_B_size, OOV, IV, OOV_R, IV_R = 0, 0, 0, 0, 0, 0, 0\n",
    "    with open(gold, encoding='utf-8') as gd, open(pred, encoding='utf-8') as pd:\n",
    "        for g, p in zip(gd, pd):\n",
    "            A, B = set(to_region(g)), set(to_region(p))\n",
    "            A_size += len(A)\n",
    "            B_size += len(B)\n",
    "            A_cap_B_size += len(A & B)\n",
    "            text = re.sub(\"\\\\s+\", \"\", g)\n",
    "            for (start, end) in A:\n",
    "                word = text[start: end]\n",
    "                if dic.containsKey(word):\n",
    "                    IV += 1\n",
    "                else:\n",
    "                    OOV += 1\n",
    "\n",
    "            for (start, end) in A & B:\n",
    "                word = text[start: end]\n",
    "                if dic.containsKey(word):\n",
    "                    IV_R += 1\n",
    "                else:\n",
    "                    OOV_R += 1\n",
    "    p, r = A_cap_B_size / B_size * 100, A_cap_B_size / A_size * 100\n",
    "    return p, r, 2 * p * r / (p + r), OOV_R / OOV * 100, IV_R / IV * 100\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    print(to_region('商品 和 服务'))\n",
    "\n",
    "    sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')\n",
    "    msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')\n",
    "    msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')\n",
    "    msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt')\n",
    "    msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')\n",
    "\n",
    "    DoubleArrayTrieSegment = JClass('com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment')\n",
    "    segment = DoubleArrayTrieSegment([msr_dict]).enablePartOfSpeechTagging(True)\n",
    "    with open(msr_gold, encoding='utf-8') as test, open(msr_output, 'w', encoding='utf-8') as output:\n",
    "        for line in test:\n",
    "            output.write(\"  \".join(term.word for term in segment.seg(re.sub(\"\\\\s+\", \"\", line))))\n",
    "            output.write(\"\\n\")\n",
    "    print(\"P:%.2f R:%.2f F1:%.2f OOV-R:%.2f IV-R:%.2f\" % prf(msr_gold, msr_output, segment.trie))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/dat.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-26 21:16\n",
    "# 《自然语言处理入门》2.5 双数组字典树\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "class DoubleArrayTrie(object):\n",
    "    def __init__(self, dic: dict) -> None:\n",
    "        m = JClass('java.util.TreeMap')()\n",
    "        for k, v in dic.items():\n",
    "            m[k] = v\n",
    "        DoubleArrayTrie = JClass('com.hankcs.hanlp.collection.trie.DoubleArrayTrie')\n",
    "        dat = DoubleArrayTrie(m)\n",
    "        self.base = dat.getBase()\n",
    "        self.check = dat.getCheck()\n",
    "        self.value = dat.getValueArray([''])\n",
    "\n",
    "    @staticmethod\n",
    "    def char_hash(c) -> int:\n",
    "        return JClass('java.lang.Character')(c).hashCode()\n",
    "\n",
    "    def transition(self, c, b) -> int:\n",
    "        \"\"\"\n",
    "        状态转移\n",
    "        :param c: 字符\n",
    "        :param b: 初始状态\n",
    "        :return: 转移后的状态，-1表示失败\n",
    "        \"\"\"\n",
    "        p = self.base[b] + self.char_hash(c) + 1\n",
    "        if self.base[b] == self.check[p]:\n",
    "            return p\n",
    "        else:\n",
    "            return -1\n",
    "\n",
    "    def __getitem__(self, key: str):\n",
    "        b = 0\n",
    "        for i in range(0, len(key)):  # len(key)次状态转移\n",
    "            p = self.transition(key[i], b)\n",
    "            if p != -1:\n",
    "                b = p\n",
    "            else:\n",
    "                return None\n",
    "\n",
    "        p = self.base[b]  # 按字符'\\0'进行状态转移\n",
    "        n = self.base[p]  # 查询base\n",
    "        if p == self.check[p] and n < 0:  # 状态转移成功且对应词语结尾\n",
    "            index = -n - 1  # 取得字典序\n",
    "            return self.value[index]\n",
    "        return None\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    dic = {'自然': 'nature', '自然人': 'human', '自然语言': 'language', '自语': 'talk\tto oneself', '入门': 'introduction'}\n",
    "    dat = DoubleArrayTrie(dic)\n",
    "    assert dat['自然'] == 'nature'\n",
    "    assert dat['自然语言'] == 'language'\n",
    "    assert dat['不存在'] is None\n",
    "    assert dat['自然\\0在'] is None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/bidirectional_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['研究', '生命', '起源']\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-24 21:23\n",
    "# 《自然语言处理入门》2.3.4 双向最长匹配\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch02.backward_segment import backward_segment\n",
    "from tests.book.ch02.forward_segment import forward_segment\n",
    "from tests.book.ch02.utility import load_dictionary\n",
    "import os\n",
    "from pyhanlp import *\n",
    "from pyhanlp.static import HANLP_DATA_PATH\n",
    "\n",
    "def count_single_char(word_list: list):  # 统计单字成词的个数\n",
    "    return sum(1 for word in word_list if len(word) == 1)\n",
    "\n",
    "\n",
    "def bidirectional_segment(text, dic):\n",
    "    f = forward_segment(text, dic)\n",
    "    b = backward_segment(text, dic)\n",
    "    if len(f) < len(b):                                  # 词数更少优先级更高\n",
    "        return f\n",
    "    elif len(f) > len(b):\n",
    "        return b\n",
    "    else:\n",
    "        if count_single_char(f) < count_single_char(b):  # 单字更少优先级更高\n",
    "            return f\n",
    "        else:\n",
    "            return b                                     # 都相等时逆向匹配优先级更高\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    HanLP.Config.CoreDictionaryPath = os.path.join(HANLP_DATA_PATH, \"dictionary/CoreNatureDictionary.txt\")\n",
    "    dic = load_dictionary()\n",
    "\n",
    "    print(bidirectional_segment('研究生命起源', dic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/utility.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "85584\n",
      "青鸿\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-24 22:11\n",
    "# 《自然语言处理入门》2.2.2 词典的加载\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "def load_dictionary():\n",
    "    \"\"\"\n",
    "    加载HanLP中的mini词库\n",
    "    :return: 一个set形式的词库\n",
    "    \"\"\"\n",
    "    IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')\n",
    "    path = HanLP.Config.CoreDictionaryPath.replace('.txt', '.mini.txt')\n",
    "    dic = IOUtil.loadDictionary([path])\n",
    "    return set(dic.keySet())\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    dic = load_dictionary()\n",
    "    print(len(dic))\n",
    "    print(list(dic)[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/aho_corasick_double_array_trie.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1:4]=SHE\n",
      "[2:4]=HE\n",
      "[2:6]=HERS\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-28 18:10\n",
    "# 《自然语言处理入门》2.7 基于双数组字典树的 AC 自动机\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "def classic_demo():\n",
    "    words = [\"hers\", \"his\", \"she\", \"he\"]\n",
    "    map = JClass('java.util.TreeMap')()     # 创建TreeMap实例\n",
    "    for word in words:\n",
    "        map[word] = word.upper()            # 存放键值对\n",
    "    trie = JClass('com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie')(map)\n",
    "    for hit in trie.parseText(\"ushers\"):    # 遍历查询结果\n",
    "        print(\"[%d:%d]=%s\" % (hit.begin, hit.end, hit.value))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    classic_demo()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/zipf_law.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('，', 173173), ('的', 128146), ('。', 81757), ('、', 40695), ('在', 28445), ('了', 27103), ('和', 24398), ('是', 18068), ('”', 16867), ('“', 16686), ('一', 11503), ('有', 9905), ('对', 9654), ('为', 9516), ('中', 9444), ('上', 8408), ('不', 7222), ('这', 7198), ('与', 7197), ('他', 7062), ('就', 6485), ('人', 6338), ('到', 6316), ('等', 6008), ('：', 5988), ('发展', 5976), ('说', 5973), ('也', 5801), ('要', 5660), ('将', 5651)]\n"
     ]
    },
    {
     "data": {
      "text/plain": "<Figure size 576x432 with 1 Axes>",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAicAAAF/CAYAAACSZWnaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3XmcnWV9///X+8yazJJ1ZrIBATIJDLKIKUZpLQhCcAsqKlgFlUq1ovWLVcGqqBW11YobYqkgYP2JlFqJiiIiVtsKGhSQLcmwJ4RkyEb22T6/P+7rJCfDTJLJ5MyZOfN+PpjHuc91b9c5TM68z3Vf13UrIjAzMzMbKXKlroCZmZlZIYcTMzMzG1EcTszMzGxEcTgxMzOzEcXhxMzMzEYUhxMzMzMbURxOzOx5JFXlHyVV9FlXPcRj10iaPMC610iqlJSTpH08ngrruKd9JdVKatm/mpvZcHE4MbP+/FjSDOBC4KeSlkn6g6RbgVsk1UsaJ6la0u0pcFRJqgWQ9DVJ8wY49kLgs/knkioKwsRhwN8CrwB+IunHBT9PSmpN4eNESf8g6VTg6FTfhyX9D/CTVIakV0j6VMG5ZwNX9q2QpM9LOrJP2f8N9k0zswNDnoTNrDxJuhaoi4g3SroBqAAqgcnAk8C5wLeBY4HtwArgLRHRJWkB0Ab8BzAL+Hvgv4B7gE0RsVHSm4G5wIuB24GngHcCHwXeDXwxItpTXf4APJ2qVgdsKahqBXBhRDwiqQZoBTYDxwC9aZvVwN8An0l1/0k65xuBWyLiU5K+BFweEU8VvAc3AR8HJpIFoi6gCtgBjAPeERGPSjoM+AFwSkSsTfv+KiJOGuz7bmZDV1nqCphZUR2THo8FJgFfioh/lvQtYH5a976I+B9J3wZOlXR7RNwJ3ClpLnAOcBqwCngHcB/wKeBGshAC0BARN0n6X+CLwKY+9XgOeB3QDVwO/CwifpZaTKoiolPSe4HjgP8vbV/4zanvck9a/khE/ErSZ9K+H0iXbd4JvApYD5wKzIuIkyVdA3wTWBIRvTsPmAWUhflgYmal5XBiVt46JU1hV4vB6yT9V0T8NUD+akoKCfVAJ/ANSScA34iIb6Y//CdHxMckXQRsTMf+APBKspAzTtIbgFcD5wFf71OPHuDTwALgUGC2pHcDApYCH46IKyS9FTgsIu6Q9HWgkSyMvLzgWB8EpgCLgEWSnk51P5PsUvW1ZC0uE4G1wIeAYyWdAzweEb+TdEV6H34h6XPAS4BHJK0BXpT2P0bSz8g+J78cET/ej/ffzPaDw4lZebsXeHN6/ANwC/ADSXcA/y9t8zWySz0/An4ZEbenkFAl6X1ANfBzSR8AaoEaSSdHxOWSHgO+DNxMdllkBvAv/dQjgMvILiNdBrwJ+HeyS0Hr+2ybbxXpjIiTJN0QEZEPUhHxRUldwHjgd2SB6iPAn5OFn/x235b0SeB96ZjXAfdLegVZiHmJpJdExCWSZgOfiohL8vunyzoL9/oOm9kB5w6xZuXtD8Db02MV8DOyyx9NwFvTNu8j6yT6SOzeCS1HFib+DDiFrGXiNCBSy0YNWYfZx8i+6JwB/BD4FVm/kX+W9Pp0rB6yvia/BKal9X8GnEgWaPZkt88pSUcALyRrDVoAnEXWAlPDrn4tSDqZrNXkb8mC2GTgWeArwPeAP4+IHQWHDkn+TDQbAfwP0ay8/YEsBPyBLEC8LiJ6gPvJWkHy/hU4v8+w4feRdZqdFhEnpc6hf0024gWyjqbfJuvcejlZUPkYWdjoAj4eET+Q1EDW1+Q/yQLOW4E5ZJdl3kp2GekF6ZgCFki6EDhO0i+Al0oqbOXdls4N8Dayjq4PAsenxyfIOtneDbRFxOnAv5G1sOwcYhwRW/u8V+PIRiI1DPRmmtnw8GUds/L2OLCM7A/2JODtkt5B1m/kHLK+FkTEekm/BN5A1tF1Elmn1m+R
XYLpzxcjYoOkt6QWl1fmV0g6lmxEDMBJwAMR8eGC9d8CPp8fzVPgGOBwshE/N0bEmrT9HLJAswNYR9Zasgh4Y0SsTqNyPg08DDwaETsk/TlwVhrS/Me03893VUGvA56NiN8ADel4Z0VE3868ZjbMHE7MylREvD0t5ucbOa2fzfLbEBHvLyifTDasuFPS9NSCAVnrwtK0/YZUNr6f41anH8g6mP5Xn/WV9P/58zVgVUR09Sl/M/DvEbFK0pnAkcCr05Dms4C3AGcDG4B/lPQasv4s3wX+L+3/DrIRPEcDn0vHzQeqDuDsfKfX1EG4pp/6mdkw8DwnZrZHkqojorPU9RgKSePI+spsL3VdzGzvHE7MzMxsRHGHWDMzMxtRHE7MzMxsRHGH2GTq1Kkxe/bsQe3z+LNb6OoNWpvri1MpMzOzIrn77rufjYimUtejPw4nyezZs1myZMmg9rnsJw9y/W+f4K5PL6Qit093dzczMxsRJD1R6joMxJd1hmBuSwM7unt5cl3fuZzMzMxsfzmcDMHclmwiyaXPeM4mMzOzA8XhZAhaW7K+JstXO5yYmZkdKA4nQzC+upKDJo9jqcOJmZnZAeNwMkTzWhpYvnpzqathZmZWNhxOhqi1pYFHn91MV09vqatiZmZWFhxOhmheSwNdPcHjz24pdVXMzMzKgsPJEOU7xbrfiZmZ2YHhcDJEhzfVkxMsc78TMzOzA8LhZIhqqyqYPaWOZZ7rxMzM7IBwODkA5rY0sGyNw4mZmdmB4HByAMxtqefxZ7ewvaun1FUxMzMb9RxODoC50xroDXi0wyN2zMzMhsrh5ADI32NnmUfsmJmZDZnDyQEwe0odlTk5nJiZmR0ADicHQHVljsOa6jyc2MzM7ABwODlAWlsa3HJiZmZ2ADicHCDzWhp4av1WtnZ2l7oqZmZmo5rDyQEyt6WeCGhf40s7ZmZmQ+FwcoDsGrHjcGJmZjYURQsnkq6RtEbS/X3K3yfpYUkPSPrngvJLJLVLWirp9ILyhamsXdLFBeWHSrorlX9fUnUqr0nP29P62cV6jYUOmVJHdWXO/U7MzMyGqJgtJ9cCCwsLJJ0MLAKOjYijgC+m8jbgbOCotM83JFVIqgCuAM4A2oBz0rYA/wRcHhFzgPXA+an8fGB9Kr88bVd0FTkxp6ne4cTMzGyIihZOIuLXwLo+xe8BPh8RO9I2a1L5IuCGiNgREY8B7cAJ6ac9Ih6NiE7gBmCRJAEvB25K+18HnFlwrOvS8k3AKWn7opvbUu8bAJqZmQ3RcPc5mQv8Rbrc8t+S/iyVzwSeKthuRSobqHwKsCEiuvuU73astH5j2v55JF0gaYmkJR0dHUN/cdMaeHrjdjZt7xrysczMzMaq4Q4nlcBkYAHwIeDG4WrV6E9EXBUR8yNiflNT05CPN7fZnWLNzMyGarjDyQrgB5H5HdALTAVWAgcVbDcrlQ1UvhaYKKmyTzmF+6T1E9L2RTdvWhZOlrvfiZmZ2X4b7nDyQ+BkAElzgWrgWWAxcHYaaXMo0Ar8Dvg90JpG5lSTdZpdHBEB3AGclY57HnBzWl6cnpPW/zJtX3QzJ45jXFUFSx1OzMzM9lvl3jfZP5K+B5wETJW0ArgUuAa4Jg0v7gTOS8HhAUk3Ag8C3cB7I6InHedC4FagArgmIh5Ip/gIcIOkzwB/BK5O5VcD35HUTtYh9+xivca+cjkxt6We5b6sY2Zmtt+KFk4i4pwBVr11gO0vAy7rp/wW4JZ+yh8lG83Tt3w78MZBVfYAam1p4L+XDb1zrZmZ2VjlGWIPsHktDXRs2sGGrZ2lroqZmdmo5HBygLW21AMesWNmZra/HE4OsPw9dtwp1szMbP84nBxg0yfU0lBT6eHEZmZm+8nh5ACTRGtLPUs9jb2Zmdl+cTgpgnnTGli2ehPDNL2KmZlZWXE4KYLW5gbWb+3i2c0esWNmZjZYDidF4GnszczM9p/DSRHkhxN7xI6ZmdngOZwUQVN9
DZPGV3muEzMzs/3gcFIE2YidrFOsmZmZDY7DSZHMa/GIHTMzs/3hcFIkc1vq2bS9m2ee217qqpiZmY0qDidFkp/G3v1OzMzMBsfhpEh2hhPPFGtmZjYoDidFMqmumqaGGneKNTMzGySHkyKa21LPsjW+rGNmZjYYDidFNLelgeWrN9Hb6xE7ZmZm+8rhpIjmtjSwtbOHlRu2lboqZmZmo4bDSRHNTdPYu9+JmZnZvnM4KaJWDyc2MzMbNIeTImqsrWL6hFq3nJiZmQ2Cw0mRzfU9dszMzAbF4aTI5rbU075mMz0esWNmZrZPHE6KbG5LAzu6e3ly3dZSV8XMzGxUcDgpsvw09ks9jb2Zmdk+cTgpstY0nHi5+52YmZntE4eTIhtfXclBk8ex1OHEzMxsnxQtnEi6RtIaSff3s+6DkkLS1PRckr4qqV3SfZKOL9j2PEnL0895BeUvkvSntM9XJSmVT5Z0W9r+NkmTivUa99W8lgaWe64TMzOzfVLMlpNrgYV9CyUdBJwGPFlQfAbQmn4uAK5M204GLgVeDJwAXFoQNq4E3lWwX/5cFwO3R0QrcHt6XlKtLQ08+uxmunp6S10VMzOzEa9o4SQifg2s62fV5cCHgcKxtYuA6yNzJzBR0nTgdOC2iFgXEeuB24CFaV1jRNwZEQFcD5xZcKzr0vJ1BeUlM6+lga6e4PFnt5S6KmZmZiPesPY5kbQIWBkR9/ZZNRN4quD5ilS2p/IV/ZQDtETEqrT8DNCyh/pcIGmJpCUdHR2DfTn7rHXnPXZ8acfMzGxvhi2cSBoPfBT4xHCdM7WqDDj7WURcFRHzI2J+U1NT0epxeFM9OeFOsWZmZvtgOFtODgcOBe6V9DgwC/iDpGnASuCggm1npbI9lc/qpxxgdbrsQ3pcc8BfySDVVlUwe0qdhxObmZntg2ELJxHxp4hojojZETGb7FLM8RHxDLAYODeN2lkAbEyXZm4FTpM0KXWEPQ24Na17TtKCNErnXODmdKrFQH5Uz3kF5SU1t6XBLSdmZmb7oJhDib8H/BaYJ2mFpPP3sPktwKNAO/BvwN8CRMQ64B+B36efT6cy0jbfSvs8Avw0lX8eeIWk5cCp6XnJzW2p54m1W9ne1VPqqpiZmY1olcU6cEScs5f1swuWA3jvANtdA1zTT/kS4AX9lK8FThlkdYuutaWBnt7g0Y4ttM1oLHV1zMzMRizPEDtM5k3L7rGzfI0v7ZiZme2Jw8kwmT2ljsqcfANAMzOzvXA4GSbVlTkOa6rzXCdmZmZ74XAyjFpbGljmETtmZmZ75HAyjOa1NPDU+q1s7ewudVXMzMxGLIeTYTS3pZ4IaF/jSztmZmYDcTgZRnNbshE77ndiZmY2MIeTYXTIlDqqK3Pud2JmZrYHDifDqCIn5jTVO5yYmZntgcPJMJvbUs9yX9YxMzMbkMPJMJs7rYGVG7axaXtXqatiZmY2IjmcDLO5zflp7N16YmZm1h+Hk2GWv8fOMk9jb2Zm1i+Hk2E2c+I4xlVVeDixmZnZABxOhlkuJ+a2eMSOmZnZQBxOSsD32DEzMxuYw0kJzG2pZ82mHWzY2lnqqpiZmY04Dicl0NriETtmZmYDcTgpgdbmegBPxmZmZtYPh5MSmDlxHHXVFSxf434nZmZmfTmclIAk5jTX0+7LOmZmZs/jcFIic5o9YsfMzKw/Dicl0tpSz+rndrBxm++xY2ZmVsjhpETynWJ9acfMzGx3DiclMjcNJ253p1gzM7PdOJyUyMyJ46itynk4sZmZWR8OJyWSy2Ujdpb5so6ZmdluHE5KqLW5gXaP2DEzM9uNw0kJzWmu5+mN29m03SN2zMzM8ooWTiRdI2mNpPsLyr4g6WFJ90n6L0kTC9ZdIqld0lJJpxeUL0xl7ZIuLig/VNJdqfz7kqpTeU163p7Wzy7Waxyq/IidRzq2lLgmZmZmI0cxW06uBRb2KbsNeEFEHAMsAy4BkNQGnA0clfb5hqQKSRXA
FcAZQBtwTtoW4J+AyyNiDrAeOD+Vnw+sT+WXp+1GpPyIHU/GZmZmtkvRwklE/BpY16fs5xHRnZ7eCcxKy4uAGyJiR0Q8BrQDJ6Sf9oh4NCI6gRuARZIEvBy4Ke1/HXBmwbGuS8s3Aaek7UecgyaPp7oy57lOzMzMCpSyz8k7gZ+m5ZnAUwXrVqSygcqnABsKgk6+fLdjpfUb0/bPI+kCSUskLeno6BjyCxqsipw4vKme5W45MTMz26kk4UTSPwDdwHdLcf68iLgqIuZHxPympqaS1KG1uZ7lbjkxMzPbadjDiaS3A68G/ioiIhWvBA4q2GxWKhuofC0wUVJln/LdjpXWT0jbj0itzfWsWL+NrZ3de9/YzMxsDBjWcCJpIfBh4LURsbVg1WLg7DTS5lCgFfgd8HugNY3MqSbrNLs4hZo7gLPS/ucBNxcc67y0fBbwy4IQNOK07pzG3q0nZmZmUNyhxN8DfgvMk7RC0vnA14EG4DZJ90j6JkBEPADcCDwI/Ax4b0T0pD4jFwK3Ag8BN6ZtAT4CXCSpnaxPydWp/GpgSiq/CNg5/Hgkam3JhhN7GnszM7NM5d432T8RcU4/xVf3U5bf/jLgsn7KbwFu6af8UbLRPH3LtwNvHFRlS+iQyeOpqpD7nZiZmSWeIbbEKityHDa13ncnNjMzSxxORoA5LR6xY2ZmludwMgLMbW7gyXVb2dbZU+qqmJmZlZzDyQjQ2lJPBDzS4dYTMzMzh5MRIH8DQA8nNjMzczgZEQ6ZUkdlTix3p1gzMzOHk5GgujLH7Kl1nuvEzMwMh5MRw/fYMTMzyzicjBCtLQ08sXYL27s8YsfMzMY2h5MRorW5nt6Ax57dUuqqmJmZlZTDyQix8x47vrRjZmZjnMPJCHHo1DpygvbVHrFjZmZjm8PJCFFTWcHsKXUs84gdMzMb4xxORpDWlnrPdWJmZmOew8kI0trcwONrt9LZ3VvqqpiZmZWMw8kI0tpST09v8Phaj9gxM7Oxy+FkBJmT7rHjmWLNzGwsczgZQQ5vqkeCZR6xY2ZmY5jDyQhSW1XBwZPH++7EZmY2pjmcjDCtzQ0esWNmZmOaw8kI09pSz2PPbqGrxyN2zMxsbHI4GWFam+vp6gmeWLu11FUxMzMrCYeTEaa1uQGA5e4Ua2ZmY5TDyQhzeHMd4BsAmpnZ2OVwMsKMr67koMnjHE7MzGzMcjgZgVqbG3xZx8zMxiyHkxGotbmeR5/dQrdH7JiZ2RjkcDICzWmup7O7lyfXecSOmZmNPQ4nI1BrSxqx434nZmY2BhUtnEi6RtIaSfcXlE2WdJuk5elxUiqXpK9Kapd0n6TjC/Y5L22/XNJ5BeUvkvSntM9XJWlP5xhN8jcA9DT2ZmY2FhWz5eRaYGGfsouB2yOiFbg9PQc4A2hNPxcAV0IWNIBLgRcDJwCXFoSNK4F3Fey3cC/nGDXqayqZOXGcO8WamdmYVLRwEhG/Btb1KV4EXJeWrwPOLCi/PjJ3AhMlTQdOB26LiHURsR64DViY1jVGxJ0REcD1fY7V3zlGlTnN9b6sY2ZmY9Jw9zlpiYhVafkZoCUtzwSeKthuRSrbU/mKfsr3dI7nkXSBpCWSlnR0dOzHyyme1uZ62tdspqc3Sl0VMzOzYVWyDrGpxaOof3n3do6IuCoi5kfE/KampmJWZdBaW+rZ0d3LivUesWNmZmPLcIeT1emSDOlxTSpfCRxUsN2sVLan8ln9lO/pHKPKnJ332PGlHTMzG1uGO5wsBvIjbs4Dbi4oPzeN2lkAbEyXZm4FTpM0KXWEPQ24Na17TtKCNErn3D7H6u8co0prSzZix/1OzMxsrKks1oElfQ84CZgqaQXZqJvPAzdKOh94AnhT2vwW4JVAO7AVeAdARKyT9I/A79N2n46IfCfbvyUbETQO+Gn6YQ/nGFUaa6uY1ljrETtmZjbmFC2cRMQ5A6w6pZ9tA3jvAMe5Brimn/IlwAv6KV/b3zlGo9YWj9gx
M7OxxzPEjmBz0oidXo/YMTOzMcThZARrbW5gW1cPKzdsK3VVzMzMho3DyQg2t8XT2JuZ2djjcDKC5e+xs8ydYs3MbAxxOBnBJo6vpqmhxp1izcxsTHE4GeFafY8dMzMbYxxORrjW5nraV28iG21tZmZW/hxORrjWlga2dPawauP2UlfFzMxsWDicjHCt7hRrZmZjjMPJCNfakt0A0MOJzcxsrHA4GeEm11Uzpa7adyc2M7Mxw+FkFJjTXM/yNb6sY2ZmY8OA4USZU9PyDEmHFfwcIikn6T3DV9WxK38DQI/YMTOzsWBvdyX+APAL4DJgfUH5KmAi8ESR6mUF5rY0sGl7N6uf28G0CbWlro6ZmVlRDdhyEtnX9OmSTgS2Ap8DpgA7yMLJbyLiqmGp5RiXn8bel3bMzGws2FufEwHHA4cC44BrgV8B04CPSJpVzMpZprU5G7HjTrFmZjYWDHhZR1IOWB0RX5Mk4ONAkAWWduBdwNXA6cNR0bFsan01E8dXeRp7MzMbE/Z0WacXOE/SFRHxVaAV+A4wGfhhRLQDnxyWWo5xkrJp7H1Zx8zMxoC9XdZ5DXCipHPTtocD9wC3S/qLiPhtsStomdaWBpat9ogdMzMrf3sLJ9uATwNbgN5U9gDZKJ7PSGopYt2sQGtzPRu3ddGxeUepq2JmZlZUewsnTwJ/A7wDqAKqgXcC9wL/BFxS1NrZTvlOse3uFGtmZmVub/Oc1ACfTcszgRVAB3AM8DRwU/GqZoVaW/LDiTfz0jlTS1wbMzOz4tlbOPk6WSdYyEbpHFqw7vXAEuB/ilAv66O5oYaG2krPdWJmZmVvb+GkHfg1WTBZmrbPkV3i6QZeVdTa2U75ETvLfFnHzMzK3N7CCUAjWTi5gaylRMCiiJgLfLOIdbM+5rY08PMHV5e6GmZmZkU1qLsSR8SHI+JD+J46JTGnuZ51WzpZ6xE7ZmZWxvbWclKRfgCWS7qRrOWkRtLUiHi2qLWz3bS2pGns12xmSn1NiWtjZmZWHHsLJxcDK8nuA/ijfKGkjwO+Pe4wa83fAHD1JhYcNqXEtTEzMyuOAS/rSGoEzgUOBk6W9ClJn5D0CeAI4GZJ9ftzUkn/T9IDku6X9D1JtZIOlXSXpHZJ35dUnbatSc/b0/rZBce5JJUvlXR6QfnCVNYu6eL9qeNINH1CLQ21lTz8jEfsmJlZ+dpTn5P5wK8j4o9kN/f7OXB7+rmS7KZ/g249kTQTeD8wPyJeQHbZ6GyySd0uj4g5wHrg/LTL+cD6VH552g5JbWm/o4CFwDckVUiqAK4AzgDagHPStqOeJI6c3siDq54rdVXMzMyKZsDLOhHxy4Knv4iI/+2zyVDmN6kExknqAsYDq4CXA29J668ju6nglcAidt1g8Cbg6+kuyYuAGyJiB/CYpHbghLRde0Q8CiDphrTtg0Oo74jRNr2RG5c8RU9vUJFTqatjZmZ2wO3TaJ2IuPFAnTAiVgJfJJsafxWwEbgb2BAR3WmzFWQz0pIen0r7dqftpxSW99lnoPLnkXSBpCWSlnR0dAz9xQ2DtumNbO3s4Ym1W0pdFTMzs6IY1FDiA0HSJLKWjEOBGUAd2WWZYRcRV0XE/IiY39TUVIoqDFrbjEYAHlrlfidmZlaehj2cAKcCj0VER0R0AT8ATgQmSspfZppFNkqI9HgQQFo/AVhbWN5nn4HKy8Kc5noqcuLBVRtLXRUzM7OiKEU4eRJYIGl86jtyCll/kDuAs9I25wE3p+XF6Tlp/S8jIlL52Wk0z6FAK/A74PdAaxr9U03WaXbxMLyuYVFbVcGcpnoefNqdYs3MrDzty/T1B1RE3CXpJuAPZPfn+SNwFfAT4AZJn0llV6ddrga+kzq8riMLG0TEA2lSuAfTcd4bET0Aki4EbiUbCXRNRDwwXK9vOLTNaOS3j6wtdTXMzMyKQlkjhM2fPz+WLFlS6mrsk6t+/QifveVh7v7YqZ4p1szM9oukuyNifqnr0Z9S
XNaxIWqbPgFwp1gzMytPDiej0JHTs3vsPOTJ2MzMrAw5nIxCU+praGms8UyxZmZWlhxORqm26Y0esWNmZmXJ4WSUapvRyCMdm9ne1VPqqpiZmR1QDiej1JHTG+nuDdrXbC51VczMzA4oh5NRqm16No29L+2YmVm5cTgZpQ6ZUsf46gp3ijUzs7LjcDJKVeTEvGkNDidmZlZ2HE5GsbbpjTy06jk8y6+ZmZUTh5NRrG1GI5u2d7Ni/bZSV8XMzOyAcTgZxY7Md4r1pR0zMysjDiej2BHTGpA8YsfMzMqLw8koNr66kkOn1vkeO2ZmVlYcTka5I6c3+rKOmZmVFYeTUa5teiMr1m9j47auUlfFzMzsgHA4GeXaZmSdYh9264mZmZUJh5NRrs0jdszMrMw4nIxyzQ01TKmr9ogdMzMrGw4no5wk2mY08tAzDidmZlYeHE7KwJHTG1n2zGa6enpLXRUzM7MhczgpA23TG+ns6eWRjs2lroqZmdmQOZyUgfyIHU/GZmZm5cDhpAwcNrWO6sqcO8WamVlZcDgpA5UVOea1NHg4sZmZlQWHkzLRNr2Rh1ZtIiJKXRUzM7MhcTgpE20zGlm3pZPVz+0odVXMzMyGxOGkTBy5c6bYjSWuiZmZ2dA4nJSJI6Y3APDQqk0lromZmdnQlCScSJoo6SZJD0t6SNJLJE2WdJuk5elxUtpWkr4qqV3SfZKOLzjOeWn75ZLOKyh/kaQ/pX2+KkmleJ3DqbG2ioMnj/eIHTMzG/VK1XLyFeBnEXEEcCzwEHAxcHtEtAK3p+cAZwCt6ecC4EoASZOBS4EXAycAl+YDTdrmXQX7LRyG11RyR073iB0zMxv9hj2cSJoAvAy4GiAiOiNiA7AIuC5tdh1wZlpeBFwfmTuBiZKmA6cDt0XEuohYD9wGLEzrGiPizsiGrlxfcKyy1jZ9Ao+v3cKWHd2lroqZmdl+K0XLyaFAB/BtSX+U9C1JdUBLRKxK2zwDtKTlmcBTBfuvSGV7Kl/RT/nzSLpA0hJJSzo6Oob4skqvbUYjEfDwM+53YmZmo1cpwkklcDxwZUS8ENjCrks4AKQWj6JP2BERV0XE/IiY39TUVOzTFd2RqVOsL+2YmdloVopwsgJYERF3pednvHI2AAAgAElEQVQ3kYWV1emSDOlxTVq/EjioYP9ZqWxP5bP6KS97MyeOo7G20vfYMTOzUW3Yw0lEPAM8JWleKjoFeBBYDORH3JwH3JyWFwPnplE7C4CN6fLPrcBpkialjrCnAbemdc9JWpBG6ZxbcKyyJom2GY0esWNmZqNaZYnO+z7gu5KqgUeBd5AFpRslnQ88AbwpbXsL8EqgHdiatiUi1kn6R+D3abtPR8S6tPy3wLXAOOCn6WdMOHJ6I9/73ZP09AYVubIfQW1mZmWoJOEkIu4B5vez6pR+tg3gvQMc5xrgmn7KlwAvGGI1R6W26Y1s7+rl8bVbOLypvtTVMTMzGzTPEFtm2makaex9acfMzEYph5MyM6e5nsqcPGLHzMxGLYeTMlNTWcGc5nqP2DEzs1HL4aQMecSOmZmNZg4nZahteiNrNu3g2c07Sl0VMzOzQXM4KUNt07NOsb60Y2Zmo5HDSRk6crpH7JiZ2ejlcFKGJtVVM31CrUfsmJnZqORwUqbapjf6so6ZmY1KDidlqm1GI490bGF7V0+pq2JmZjYoDidl6sjpjfT0BstWbyp1VczMzAbF4aRMecSOmZmNVg4nZergyeOpq67wiB0zMxt1HE7KVC4njpjeyEOrfFnHzMxGF4eTMtY2vZEHVz1Hb2+UuipmZmb7zOGkjLXNaGTzjm5WrN9W6qqYmZntM4eTMrZzplh3ijUzs1HE4aSMzWtpICeHEzMzG10cTsrYuOoKDmuq94gdMzMbVRxOytyRnsbezMxGGYeTMtc2vZGVG7axcWtXqatiZma2TxxOylzbDHeKNTOz0cXhpMwdOb0B8DT2ZmY2ejiclLnmhlqm1te4
5cTMzEYNh5MxoG1Go0fsmJnZqOFwMgYcOb2B9jWb6ezuLXVVzMzM9srhZAxom95IZ08vj3RsLnVVzMzM9srhZAw4Kj9ix5d2zMxsFHA4GQNmT6mjpjLnETtmZjYqlCycSKqQ9EdJP07PD5V0l6R2Sd+XVJ3Ka9Lz9rR+dsExLknlSyWdXlC+MJW1S7p4uF/bSFNZkeOIaQ0esWNmZqNCKVtO/g54qOD5PwGXR8QcYD1wfio/H1ifyi9P2yGpDTgbOApYCHwjBZ4K4ArgDKANOCdtO6Ydd9BE7n5iPU+u3VrqqpiZme1RScKJpFnAq4BvpecCXg7clDa5DjgzLS9Kz0nrT0nbLwJuiIgdEfEY0A6ckH7aI+LRiOgEbkjbjmnvPulwKnPiYzffT0SUujpmZmYDKlXLyZeBDwP5sa1TgA0R0Z2erwBmpuWZwFMAaf3GtP3O8j77DFT+PJIukLRE0pKOjo6hvqYRbfqEcfz96fP49bIOfnzfqlJXx8zMbEDDHk4kvRpYExF3D/e5+4qIqyJifkTMb2pqKnV1iu7cl8zm6JkT+NSPHmTjNt8I0MzMRqZStJycCLxW0uNkl1xeDnwFmCipMm0zC1iZllcCBwGk9ROAtYXlffYZqHzMq8iJz73+aNZt2cEXbn241NUxMzPr17CHk4i4JCJmRcRssg6tv4yIvwLuAM5Km50H3JyWF6fnpPW/jKzTxGLg7DSa51CgFfgd8HugNY3+qU7nWDwML21UeMHMCbz9pYfy3bue5O4n1pe6OmZmZs8zkuY5+QhwkaR2sj4lV6fyq4Epqfwi4GKAiHgAuBF4EPgZ8N6I6En9Ui4EbiUbDXRj2taSi06by7TGWv7hv/5EV4+ntDczs5FFHrmRmT9/fixZsqTU1Rg2P3/gGS74zt1cfMYRvPsvDy91dczMbJhJujsi5pe6Hv0ZSS0nNoxOO2oap7W18OVfLOOpdZ77xMzMRg6HkzHsk689igqJT3juEzMzG0EcTsawGRPHcdFp87hjaQe3/OmZUlfHzMwMcDgZ8857ySG8YGYjn/zRAzy33XOfmJlZ6TmcjHGVFTk+97pjWLt5B1/42dJSV8fMzMzhxODoWRM476Wz+fe7nuAPT3ruEzMzKy2HEwPgg6fNo6Whlo/+wHOfmJlZaTmcGAD1NZV88rVH8fAzm/j2/z5W6uqYmdkY5nBiO51+VAunHtnC5bct99wnZmZWMg4ntpMkPrXoKCQ894mZmZWMw4ntZubEcVz0irncsbSDn97vuU/MzGz4OZzY87z9pbNpm97IJxd77hMzMxt+Dif2PJUVOT73+qPp2LyDf7nVc5+Ymdnwcjixfh170ETOe8lsrr/zCe55akOpq2NmZmOIw4kN6IOnzaW5oYZLfvAnnly7ld5ed5A1M7Piqyx1BWzkaqit4lOvPYp3//sfeNkX7qCuuoLWlgaOmNbAvPRzxLRGJtdVl7qqZmZWRuThopn58+fHkiVLSl2NEenBp5/jnqc2sPSZ53j4mU0sXb2JDVt3dZRtaqjJAkvLrsDS2lJPbVVFCWttZmZ7IunuiJhf6nr0xy0ntldtMxppm9G483lE0LFpRxZUntmUAstzfOfOJ9jRnU19nxPMnlLHCYdO5rXHzWDBoVPI5VSql2BmZqOIw4kNmiSaG2tpbqzlZXObdpb39AaPr92yM7A8vOo5fnTv09zw+6eY1ljLa4+bwWuPncFRMxqRHFTMzKx/vqyT+LJOcWzr7OEXD63m5ntW8qulHXT3BnOa61l07AwWHTeTg6eML3UVzczGpJF8WcfhJHE4Kb71Wzq55f5V3HzP0/zusXUAHH/wRBYdN5NXHTOdqfU1Ja6hmdnY4XAyCjicDK+VG7bxo3uf5od/XMnDz2yiIif+fM5UznzhDE5rm0Zdja84mpkVk8PJKOBwUjpLn9nEzfes5OZ7nmblhm3UVuV45dHT+cSr25g43sOUzcyKweFkFHA4Kb2I4O4n1vPDe1Zy
4+9XMH1iLVefN585zQ2lrpqZWdkZyeHEM8TaiCGJ+bMn85kzj+Z7Fyxgy44eXnfF/3HH0jWlrpqZmQ0jhxMbkV50yCRuvvBEDpo8nvOv/T3f+s2juJXPzGxscDixEWvmxHHc9J6XcPpR0/jMTx7iwzfdx47unlJXy8zMiszhxEa08dWVXPGW43n/Ka38x90r+Kt/u4tnN+8odbXMzKyIhj2cSDpI0h2SHpT0gKS/S+WTJd0maXl6nJTKJemrktol3Sfp+IJjnZe2Xy7pvILyF0n6U9rnq/J0pKNaLicuesVcvnbOC/nTyo0s+vr/8tCq50pdLTMzK5JStJx0Ax+MiDZgAfBeSW3AxcDtEdEK3J6eA5wBtKafC4ArIQszwKXAi4ETgEvzgSZt866C/RYOw+uyInvNsTO46d0vpac3eMOV/8etDzxT6iqZmVkRDHs4iYhVEfGHtLwJeAiYCSwCrkubXQecmZYXAddH5k5goqTpwOnAbRGxLiLWA7cBC9O6xoi4M7IelNcXHMtGuaNnTWDxhSfS2tLA33znbq64o90dZc3MykxJ+5xImg28ELgLaImIVWnVM0BLWp4JPFWw24pUtqfyFf2U93f+CyQtkbSko6NjSK/Fhk9zYy3fv2ABZx43gy/cupS/u+Eetne5o6yZWbkoWTiRVA/8J/CBiNitA0Fq8Sj61+GIuCoi5kfE/Kampr3vYCNGbVUFl7/5OD50+jwW3/s0b/7X37L6ue2lrpaZmR0AJQknkqrIgsl3I+IHqXh1uiRDeszPvLUSOKhg91mpbE/ls/optzIjifeePIer3vYilq/ZzGu//j/ct2JDqatlZmZDVIrROgKuBh6KiC8VrFoM5EfcnAfcXFB+bhq1swDYmC7/3AqcJmlS6gh7GnBrWvecpAXpXOcWHMvK0GlHTeM/3/NSKnM53vjN3/L/vn8PV9zRzs/uX0X7mk10dveWuopmZjYIw35vHUl/DvwG+BOQ/6vxUbJ+JzcCBwNPAG+KiHUpYHydbMTNVuAdEbEkHeudaV+AyyLi26l8PnAtMA74KfC+2MsL9b11Rr9nN+/g0sUP8Icn1rNq465LPBU5ccjk8RzWVM+c5noOb6rj8OZ6Dm+qZ8K4qhLW2MysdEbyvXV847/E4aS8bN7RzWMdW2jv2MQja7bwSMdm2tds5vG1W+jq2fU7P7W+hjnNdRzeVM+MieOorshRVSGqKnNUVeSoSY9Vqby6IrdzXf55XU0lMyaOK+GrNTMbvJEcTipLXQGzYqivqeToWRM4etaE3cq7e3p5av02HlmzmfaOzTyyZjOPdGzmR/c+zXPbu/f7fIuOm8GnXnsUE8dXD7XqZmZjnsOJjSmVFTkOnVrHoVPrOHXnaHWICHZ099LV00tXT9DV00tn3+c9vXR1Z887e3ro7M7KH1r1HFf9+lF++8haPv+Go3n5ES17qIGZme2Nw4kZ2cif2qoKaqsqBr3va46dwSuPns4Hb7yXd167hDfNn8XHX91GQ637s5iZ7Q/f+M/sAHjBzAksft+JvOekw7np7hUs/PJv+N/2Z0tdLTOzUcnhxOwAqams4CMLj+Cm97yUmsocf/Wtu/jEzfeztXP/+7KYmY1FDidmB9jxB0/iJ+//C9554qFc/9snOOMrv2HJ4+tKXS0zs1HD4cSsCMZVV/CJ17RxwwUL6I3gjf/6Wy77yYO+B5CZ2T5wODErogWHTeGnf/cyzjnhYP7tN4/x6q/9D/c+5Sn2zcz2xOHErMjqayr57OuO5vp3nsCWHd28/sr/419+vtTT6puZDcDhxGyYvGxuEz/7wMs487iZfO2X7Sy64n+54+E1bNjaWeqqmZmNKJ6+PvH09TacbntwNZf84E88u3kHAIdMGc+xsyZyzKwJHHfQRI6aMYFx1YOfc8XMbF95+noz280r2lo4cc4U7nlyA/eu2Mi9T21gyePrWHzv00B2s8K5LQ0cO2sCxx6UhZa5LQ1UVbix08zKn1tO
Erec2EiwZtN27ntqI/eu2BVaNm7rAqCmMscLZk7gmFkTOHJaI3U1ldRW5aiprHjeY01Vjtr0WFOZI7u5t5nZLm45MbN90txQy6lttZzalt2fJyJ4ct1W7nlqA/et2Mh9Kzbwvd89yfauwXWmra7MUVuZo6aqguqKHBU5UVkhKnOiMpfdYTkry2Vl+cecqMpvnxO5nMgJctq1XCEhZfv3t66yIsf0CbUcPHk8h0ypo7mhhlzOYcnMBuZwYjaCSeKQKXUcMqWORcfNBLI7Kz+9YTvbunrY0d3Dju5etnf1sKOrl+3d2ePOsoLHHd09bO/qpbunl+7e7KaFPb1BV0/Q3Ztfzm54uLWzh+7eXrp7gu7e2LlPBPRG0NMb9Kbl/PMIUnlajl3LhWoqcxw0eTyHTB7PwVPGp9AynoMn1zFr0rj9ur+RmZUXhxOzUaayIsfBU8aXuhr7rLO7l6c3bOOJdVt5ct1Wnly7hSfXbeWJtVv57aNr2dq5a2I6CaY1Zq0sB08ez+ypdRzeVM+c5joOnlxHdaX73JiNBQ4nZlZU1ZU5Zk+tY/bUuuetiwjWbunkibVbeXLdFp5cu40n1m3hybVb+dWyDjruXrFz24qcOHjyeA5vygLLYenx8KZ6JtVVD+dLMrMiczgxs5KRxNT6GqbW1/CiQyY9b/3mHd082rGZRzo282jHFh7p2Mwja7bw62XP0tmzq9/NpPFVO4PK4c11zJo0noqcUDqHdp4v/SDSfzvrIQrW7ddryQJU1n8n33cn69Oz23K+r0/q15Pv02NmuzicmNmIVV9TyTGzJnLMrIm7lff0BivXb8vCSsdmHknB5faHV/P9JaNvUrv6mkomjq9icl01E8dXM3l8FRPHVzNpfDWT63YtT6qrSmXV7ptjZc3hxMxGnYqcss60U8Zz8hHNu63buLWLlRu20Zt64uY75AZZ59wgu5yU76ebrd+1bn/19mYdg7t7Y7fOxPmOxzs7F+9cl3Uy7uzuZeO2LjZs7WLdlk42bO3k8We3sH5LJ5t2dA94vtqqHHXVlam1ZleLzK7lXaOwqgpGYeVbdna1FGV2tjClAqE+67Oymqoc46oqGFddsfOxtqqC8fnnVRXUVu/+fFx1BVUVud1apvLHzg9zz9eHgjIbuxxOzKysTBhfxYTxVaWuxgHR1dPLhq1dbNjaybotnazPL2/tZMPWLrZ2dtPds2vEVXfBY1d+lFVPsLk7v10WiLLRVbsHtHx4yz/PHlPAK9hmR3cv2zqzEWDDoW+A2llesH5XmXZbub8RR4KqihzVFbmdYS+/XFWZBb/qtJwFvxzVBcv5y3j5UFhRIar2cEmvqkLk0pD8vq/vea+xn9e74LApTJtQu5+vdmRyODEzG6GqKnI0NdTQ1FBT6qo8T29vsL27h62dPWzr7GF7V1ruyn62d+56nh/Onhexe0tVPvjsWt61YvcWrrRM9FO2+3YxhHawiCwYdvX00tWdhbqu3qCrOyvrTKFvR1cvm7d305mCX1dPn4CYH66fgmJvkeY8vebt8x1OzMzMcjkxvrqS8dX+M7Kvegsu+3X1pMuAKfj0FiSX/oLY88t3aWkceeF1qPxbZWZmNgxyOVGdE9V4vp698TtkZmZmI4rDiZmZmY0oDidmZmY2ojicmJmZ2YjicGJmZmb7TNIpkiYoGWCbqvRYKamy7zpJe8wfZRtOJC2UtFRSu6SLS10fMzOzMlEPXAocA9wi6ccFPw9LegVwraRjgDembe6V9JCkW4FbgEP3dIKyHEosqQK4AngFsAL4vaTFEfFgaWtmZmY2ukXEzZJWAE8D3wR60qrngJcBXcDHgTcDXwLuBc5O2/8Y2BYRa/d0jnJtOTkBaI+IRyOiE7gBWFTiOpmZmY1qkt4s6d+Aw0h3Nuhns6r09/dzQCNwTvo5HPgU8MW9nacsW06AmcBTBc9XAC/uu5GkC4AL0tPNkpbux7mmAs8O435j5Zyjqa6lOOdoqmspzjma6lqK
c46mupbinKOprkPZ95DB7hAR35e0Ejg9Iv5D0quBuWn12WQtJ5dK+gxwR0RcDHxc0mkR8SFJrwdeuLfzlGs42ScRcRVw1VCOIWlJRMwfrv3GyjlHU11Lcc7RVNdSnHM01bUU5xxNdS3FOUdTXYe67xDkL+VMjoiTJH2RXZniY0A38ApJ55KFp1skfQCoBbZJOjMifjjQwcs1nKwEDip4PiuVmZmZWXH07SoSwM1kl3KOKyi/JyI+u6cDlWs4+T3QKulQslByNvCW0lbJzMysLAiYK+kTwDRJvwBmA1f22e5MYDzwAuDUiOhOw4p/vLcTlGWH2IjoBi4EbgUeAm6MiAeKdLr9vSw0lMtJY+Gco6mupTjnaKprKc45mupainOOprqW4pyjqa5D3Xd/HAMcCVwLvD4iTo2IOWSjdNqAHcAkYBzwuf05gSL662hrZmZm9nySmoHtEfFcn/L3kLWUfAV4F/C7iLhb0q/I+qgEWauLIuLlezyHw4mZmZkVi6TqNK3Hvu/jcGJmZmYjSVn2ObGxKd3mYUz8TksaV+o6jFSSjs/f12OYz9vvPUbK5Zxp5u2SvM5yJ6m17/1nxrox8UE+EuT/Qac/oA2j6R/4YD/oJTXtxzny78/09A91UO+PpGmR6c1/iBaLpGMlDXryogNw3vx79GfAWwYTxCS9SNK8dLOu4/fnvEMxXKFR0kXAORHRNQznyqXHF0hqigPQDL0v73W6adrJkqYzTJ/haaKtxZJm7M/rlPQ6SXWj5XNvf+qZPtv3Z7/3A29IAzn26/yj5X0dDIeT4dMiqZ5sApp/Bf5GUl2J6zSggg/e44FXDmK/48k+xGoGc76IiPTN4SKyMfFv3df3R9KRwFKlGzxGRI+kSZKq97Lf/nyQ5MjuH/F+Sf8xlBaMgm+i0yWN39v26T2qJuv9fi8F/3739FpSHV8PvBq4Gjh2kFXN/y68qOCYe/zs6FufFBoHE6byQWzmIPZpI/td/cre6jPAuZr39fdWktJrqiJ7T1/Yd/1e9m/J/+5I+jtJC2Hn/+OB7vKak/Ry4C+BVuCrwLHD8Y07In4M/IrsZm5vKKjTvoSp1wNnkXUjGPZ+BIP5siTphZD9fygo29fPiWrgNElTB3G+NuAvI+Lzks6SdFLf8w+wX/7f5AmSji7F+1psDifD5xTgGuDDZDPnzQD+XtK0ktaqHwUfvNVkN23aNojdPwNMBHL78W35rcB64CPAUcBHJB2+D/t9APhv4OQUGA4FzgWa97TTYP9Bpw+pd5ENl7sKeGQw+/c5VkUKUZOB7++trgUuA24iuyXDDZLeIim3p9cSEdsi4h/I5iGYCzwn6ZB9+AM6K+3fI+kEshtoXpXq3run/fP1kfRXkq6RdHBE9KayvZ23Kv2RfjHwhUEE3X8FvhoRK9KX2FpJjZKm7OX9yQfjhcCf78uJCo53GXBLRPxcUr2kD6T3Z2+/W68FvqVsdEMrcKGky/e0b3r/ZpL9vm8G/hNYX/iNe08kHVawvM/BXFKbpL8A/g/4NHCupC9Jqt/T60zv/8eBdwNryG4Ch6TZkibt6/mHQtJHgM8pu0vu3radCnxd0rmS/lrSX8I+BYUFafFtwD8Dp+YDyj58Bn4ReFzSO8i+mL1KUpOkC/cUzAuC8XXARSpya3EpOJwMk4j4LvAAcGREnBsRnyCbIO6zBb/cI0W+NeCjwG/TB2+1pDfvaSdJHyMbJnYn2TCz3n05maR3SDoCWA1MAQ5O576L7P15kwZoWVDWJDoPuJ5sbpsZwOfJ/nh3FXw7rSnY532SzpHUsC/1y0sfUn8ELgE+BHwsIgYT3AqPlZ/6+XvAExHx+J62T3/0rgIaI+KbZK0DO4CjyULcwf3so4LlGWT30fgE2c23zgIWKGvN6+98pwBvk3RQOs4lZH9QHwfukPTigT60lV1Cqk3h5q+BZ4Av5b9x72G/d0l6VcElmc+SzVG0o/DDt7+WAkkfTa9vraSDyILKT4AfAN/q
73xpvyPTvn9Pdk+Q9+/t96LgW+uZwJyI+FRa9XmyCR8/VxgEBnB92v5lZP9WXk32zft1kmZKetsAAeJm4OdkAeUM4IOSXpvqUyHp/P7+IEo6C/hrSQsk1fbXQtPf+ZRNsvUXZEH8bLL3+H1k4ejG/B/w/kTEdrIw9ROyYBOSZqfXvc+tC/tL0slk79G/Ay9Tupw5UDCLiGfJfsdfT9Y69RZJf7unICfpJWRh7QPAq8jC25Fkv/eQtWz1O2Q2vbergPvIPk9+AdxIFlKOjYh+ZzWXdLCyVuLLgd8AdwOTJE2V9FZlX3hGPYeTIpM0peCDdSvQIOkNkm4BHiP7h/MJSReWrJIF0i/2ZyV9lmyinbvTt44v0ucDpc8fjJOAlwBvImtZ2Kc//MrGyy8AqiPip8By4FTgfLIPtFnAe4Hn9ZOQ9FKySxW/IHsv30E2jv6W9HMZ0JMCTP5b2ziyJvgp7H6Lg331OHAbcMO+fmPtp975yznvBw6OiLel5/39Ucl/MD5NNsvxdyX9TSr7OtkfjVeStcK9uHC/9AdofAq/lwBXp3tZXEX2/+g04HT1f/nsUbL5Cs4ArgB+k5r2v0Q2C+Ql/YXq9G3vLWTfIr8JXBMRHwV+SvaHdE+XlJ4APinpHyT9E3BXRPwwvS/5lpi3Ayf3OedJZH9APwv8HTCZLCC/JSJOJWtFG8gyoJ7spmnfILt0MS0dd8IA4W1iKv8b0gRTkt6WjvMKsla816uf5n1Js5Q15f8FWUh8BGiT9OVU7/8m+4P+ooLWpwpJh0s6nSxsPQ58EPgd8FLgHyR9GjgJmDDAl4L7yS5Hnkn2u3IiMFPSEZKOknQp8KEUKPN1fXE6/rVkM3yuAl4DvJ+s9e5fgQ+ncz+PpL9Pr+8ospbBX5GFwHvI7q0y2MuL+0xSLdnv/F+TfXlZCFQqa7F5e9//r5Ler6xfzevIvpxdQNbaMyMf5CS9XulSccG/y6XAf5B9ocqRhYXvkIXMlwInAm/oJwi+jOwLwq3AG8i+9PyR7LPwpWS/W0h6laQJfV7eKuB0shD0rfQa30PWivJqstbnUc/hpIjSL+S5ZN9YLiK7t8BWsl/CLwB1EfFL4MvAOdpLH4nhEBHryPozvAv4F7I//O8n++BbnN9O0puAE9LyOLKWjs8DFWTN1GdI+v/bO/cgu6oqjf9WR8yDhEcAqdATojyCQCGkFLEA3wyMBGp4WfIKSECIARIBSZFCAtaEipRIpCIBeYUJjxGVEDQSBBokAkajDhPUiegEjAZ5CANFEoZJ4jd/fPukD4fbfc/tJqE7s7+qrtzce9bZ++yz91rfeuxztisd35X38X7g78CREXEGDqvfnc4zD3ha0sclPVoWSuebhiMlCzGJ+RgpaoPrVn6BPb2pwH3h6MsPsBFbQuebNFsZnxckzZf0YKuyqd9FOmc34DLgpNL3bzEqpSjDPsAi/NrxsdhgvI7vy73Ak9ggRkXuQqy8/ibpnoi4GV//AqwYP5POV+7jSJxi+wU2ZscDHRExAyvDxdh7/yAVJG9vDjZiw4FlyVAchlMu/9HV2Ei6P11PO/BZ7EWCaxX+Hi4Enow9xaKvw4CLgctSRGl/YDzwIjaAIem1btpcjwnxKmBHbMgWRsR1eFx3iYjxiUSTjNqNaWzukbQkHCU5BROGizExb8cEpIpjMIlejEnW8TgScT0mNwOxXt4rOqOFQzHZOwdYkX6fANyRrvUenCqeBAxN4129zmV4PS/HDsQ1+B5/G7gIOyJ/AV5K1zkEE9Ghqb3z8P0+Fxvk6cAvgX+hUh8WLnwdi+fqPBxNHYodhNE44jefHrwRtwVcmdpYjefM1tjwjwNOkrQq9bWIwv0Uz5nJOPrxD9jZ+s8UgdkOR1NGw5vW16ckPYx1+pZ0RuDmSHoc69IP0BmNJo3VJDx2h+E0+ON4zhyOH1Z2YjjlNwGPXyE7AhP1qzEJGYZJ3314PP8GfCI2g918mZxsRMiYiSfcQXhBB1YK
r0gqjP3+wFy1+JCajYg9sBe7Dhfw7oDrIs6JiE8nz+8s/GoAsAFcLOmnmBjMwgTlrGRQGobykwLcE8tdZkEAAAsVSURBVBub3+Nw5uuYGAl7pX+IiOHVqEI63wnAz7DSPRdHF57EJPAzQAdWUl+S9AJWHk9iIvVXdfNGzI2BZCiLdM6dwNWSfh2uGVnfjdwk7GWfhefRWmxIPgG8S9IMTHqXlMc5Ik7Gxm4KMDIiJmIP6xZM+hbjcV5TafJ1fO+fwqmGa5LcEmCcpKeBicDLlX6Wd3UNA74DHADcDrwm6TvFOHRxnQMkrZY0ERv5myPigtLYTAe+mgh0gaOAX0r6efr/Ihwh+CQmZbtHg91m4TTle8JvTF2OidSp6ZqEo2Onpu9nk97AmozaGTjCcXBEbIPfwHo1vkf7pOv8NI3fXXYjNkKT0/gswcZwDDaaf8Epm1mS1qQ2X8Wk5A1gYZq37ZiYbJPaWYD1ytUpnVK93pC0Vn4T+1RMLK4D/knSOEnHSrqjlKY8BRu9J3AdxWJswAsH4BJs8P8ITJK0utTcB7EO+AlwBSY8IzHhvBjP2xeAVY3uTW8REScA20majcf4j8DncaRwf+DVcCHpZcDhKUI0AuuzJ7D+uQ0Tm1ewA3Q+Ht+VqY1IBOC0iFiA1+ZU4IdYpxUF4FOBmcW9TBiHnb4HgQ/h8dwej+mDONV7KtaLZ0p6pWgT67kv4fHrwPdxACaul2J9dxqOoPRr5Iew9RKFJ9zFb23J4+vAaYcOHMr7Ng7d/RZP5ncDq5sYqKgYnpYr3+vIhHOnH8Ks/gLsEayVdGM4d3oY9gRmJk+XiNi36hEnUvIRTGyWSPphF+3tJOnZ5KkehA3Fk3R6pcMrxqiQa1NngeUhOALxKzyea/ECPRhYJum8iuwBwMpkCDYZSvPha8DBkg5O33d7XyJiP+AlSX+OiLuA92FjOQgbl5URMbhkWApF9o/YI9sNG7+J2IgtxQr3WmBUIhuN2p0JvCBpRjjNcr+kjvTbxyU90kCmMJRTJC0NFySehovBO/D9WF2Vq45R+rwz3pGyhkQaJJ1bOb66LiZjD3gBJlTD0xg8UDYQKfJ3TOrTHKCtcA4iYjrwiKQHIuIRYLakOxv07wycfvwZnYZpqvy47re8Dr7oa3jX02xci7MCk5ML8P3839SXL5fktky/P4XX0/rU94fwmnkIR2EbplcatL8FJvR3S3o4UpqxrH8iYkdJz4cL9n+Oydf+uKj6yzhNsqTkYJXbGYUjQRfhNTg/Xe916XovxVGfs4FTy+uwtEZ2lPR8d9fTxTVuhSOv56WxmQj8GXgUk4LbMBFox3ri+/hdMK+m/m1FSgGn73bFpGxP4Bk59Vxt83KcyjwzzZn98Br7CrC9pEmV40dI+mtEnI4J0zdx2noGdiROxmnjbzVoa0scFf5v4DFMlv4NO2bPpXMdgwn7fa2MXV9DJic9QFp8oyQtqnn8QKyAzgXGJoVwDTYQCyTdVPM8AzBxeELSGy30t0sC1eDYMdgQrkik6gFJXyv9PhN4VtLXG8i+KaUQDrlfAawpK9sGcjvjKMjN2GD+AHtaS7siNUV7SdkOxSmlDqwMt8Le5NPA5XWvfVMgEaNFwK7yrpLa9ybJT8YKvth2fZOkhgWfpfG5BCvLMTjdMhMr2kcbyZXkj5Z0d/r8WWA/eddPdzI7YZI4XtLCiJiHDUBbav8xSfNqXGeZBNyEScZxzcYqnMv/pFKRaur3hzFZUuXYQZL+JyKuwsZnnqQb0v/n4ejL9pImV+SKcQ3gQEmPRcSVeF1c1eza0jmuwJ75jtizvwPP3X1xdGh95fgtJK2NiGtxJHYaTkftgY3UgZJuqdN2Ot/JeA5+tclx7ThycguOPizHOmgl8EV18TyZcLRvOfbsx+O0zjQ8rhNIqSlJX2kguy9eI3tKerbuNSXZIdjAb536eRUmdbMwYV2BI6jzsUO4LsmNwORlGK4ZeQanccZiR+xPDdoq5kFR
azcaE6FzMDG6FDhUXdSmhdP4H8PzYAomnvNxFOcGORVXPX401nXr6KxxWYoJ8ihMXP+rFfvQZyEp/7XwhyfE53CB2fXYY6krexSefFfhfP/OdeVTux/FHtTRLbTZjr2IKzCj3qIF2UlYCV0JbIFTNXcBA2rKB94WfB4wuMlxg9LnudjbKQoc22u0sy3w/vR5OiYp25DId1/6w57S2PS51jhW5MfhMDq4DmUuMLoF+Ut6Mi7Yu1xc3K8mxw5Ka2M5MK303Thgl5rttaV/hwAPAx+pKXck9tKj9N3Abo7fCRuRXXD66Rt0hujvxWmzZm2ejp2MDfO5hsx7cZHoAOy03AiMqXmN70v9m45TqofW6We5b5iwLap5PwfiVOlpmMTNwhHgtm5kBqR/x+CasMG4xupxbLgDGNpVH4FD0ufD6uiAYq6UPh+Ao27H4wjaM5gcnUDSn6U51oZJ3vg0fy4EbsARoinA2U3a3YGkx3FU41eYCH20Zr+n4Ygb2LGai4ljw2vEdXpDcaHxi2ke7FOnrf70lyMnPUDKNR6C2etP1MJ20nAF/7XA7pL2K3uITeTacK5xAPBjSUtrtrc1ViwnAA9J+m4TkbLsKXjRPo+r2J/DXt2ybgXffI7t8MJdUfP4I3DNwOPAtpJerClXeDHDcZjzROBlbWYTPCov0Cq8/xpyxfjcCtyuFkO+ySO9E8+j1XXGNbxb4XVJ/95KW5Vz3IFTQd2mLCoy75K0rk4aMx2/ISWWIhrHYcP0G0l/aCIb2Pv9vaTn6q7nBuc5Aq+TO2sefxA2ZIPwTrdaciX5kdhhOQnqPfMnpex2xU7LrXLNUjOZI4DXMOE4H5PWNZjoVmudqrKDcUTgcknfa3LsTvi+vYTJzzdwiux0TJAuS32YBfxO0hcq8sX6GIyjV8fgqMk84LdK9VLdtL9hrkXaRi5peXcyJdldimMj4rbU3oxm7WHCNwfX670HR6n6Uu1ir5DJSS+QDO8q9SCEFhH/LOmeFmWGAHtLWtJqe0m+liErHb819sheSqHLPdRC6LgnSCRsqCqv4m5BfgCO1Mxupvz+PyIi9gHekPRUi3JtOFXT5TNDupDbkAJplShGxO7A6ZIuakWuJyjVOuyNH3//lnRDjXNssiegpnk+ClihHmxpT6nmEyXNaUEmcP3SgZL+tcX2hmEOtKpFuVpvs42II3H6Zjqukzq79NsEvF33fEyU3t3d/E9zfS9ghFxDMrCuju/NHAjXAh3bjAhVZM7EUbddcQT1Rz1puy8ik5OMbrEpFW7GpkFRv/BO96MO6hqnt7G9HYAjWjHa/RU9Xdut1klVZHsUWWpyzs/hWot5OIV0LC62fU3esUP49QCHA5dKqvUckCIC93b29e1Gb+5FX0feSpzRLTIx2fzQX4gJwKYOUac04q2bss13Cj1d270xhhuBmAzEqZ/PA8twqmpvvFNtZHQ+GXoQ3oZd+wFlfZ2YQO/uRV9HJicZGRkZJfQHo5SxIc10ES6I78DbadvxzsjjcRHs0RFxKJ0PXszoJ9job7PMyMjIyMh4u5Fqma7FW+Pfi7fYTsFP+x2NdyR9AG/TniC/OyejnyCTk4yMjIyMfocUOXkZv2vmdkn3RsTh2K7thh9/cL2kG97Bbmb0EJmcZGRkZGT0O6SamXURcWGpfmYhfvDZ3IhYT3pXUEb/Q645ycjIyMjot6gU9v4GGJKiKl/AdSgZ/RCZnGRkZGRkbC5Yid+J8ymc6qn1sMqMvof8nJOMjIyMjM0GEdEuaeU73Y+M3iGTk4yMjIyMjIw+hZzWycjIyMjIyOhTyOQkIyMjIyMjo08hk5OMjIyMjIyMPoVMTjIyMjIyMjL6FDI5ycjIyMjIyOhT+D/t9A87TA+J8QAAAABJRU5ErkJggg==\n"
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-01 09:53\n",
    "# 《自然语言处理入门》2.1.2 词的性质——齐夫定律\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import os\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')\n",
    "msr = os.path.join(sighan05, 'training', 'msr_training.utf8')\n",
    "\n",
    "# Tally how often every token occurs in the MSR training corpus;\n",
    "# tokens on a line are delimited by two consecutive spaces.\n",
    "f = Counter()\n",
    "with open(msr, encoding='utf-8') as corpus:\n",
    "    for sentence in corpus:\n",
    "        f.update(sentence.strip().split('  '))\n",
    "\n",
    "\n",
    "def plot(token_counts, title='MSR语料库词频统计', ylabel='词频'):\n",
    "    \"\"\"\n",
    "    Draw the frequencies of a list of tokens as a line chart and show it.\n",
    "    :param token_counts: non-empty sequence of (token, count) pairs, e.g. Counter.most_common(k)\n",
    "    :param title: chart title\n",
    "    :param ylabel: label of the y axis\n",
    "    \"\"\"\n",
    "    # Import the submodule explicitly instead of relying on pyplot having loaded it.\n",
    "    from matplotlib import font_manager\n",
    "    from matplotlib import pyplot as plt\n",
    "    # The labels are Chinese, so an explicit font is handed to every text element.\n",
    "    fontpath = ensure_data('SimHei.ttf', 'https://github.com/StellarCN/scp_zh/raw/master/fonts/SimHei.ttf')\n",
    "    myfont = font_manager.FontProperties(fname=fontpath)\n",
    "    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly\n",
    "    ax = plt.figure(figsize=(8, 6)).add_subplot(111)\n",
    "    columns = list(zip(*token_counts))  # [(tokens...), (counts...)]\n",
    "    tokens, counts = columns[0], columns[1]\n",
    "    positions = np.arange(len(tokens))\n",
    "    highest = max(counts)\n",
    "    top_offset = highest + len(str(highest))  # upper y limit: the peak plus its digit count\n",
    "    ax.set_title(title, fontproperties=myfont)\n",
    "    ax.set_xlabel('词语', fontproperties=myfont)\n",
    "    ax.set_ylabel(ylabel, fontproperties=myfont)\n",
    "    ax.xaxis.set_label_coords(1.05, 0.015)\n",
    "    ax.set_xticks(positions)\n",
    "    ax.set_xticklabels(tokens, rotation=55, verticalalignment='top', fontproperties=myfont)\n",
    "    ax.set_ylim([0, top_offset])\n",
    "    ax.set_xlim([-1, len(tokens)])\n",
    "    ax.plot(positions, counts, linewidth=1.5)\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "# Print and plot the 30 most frequent tokens.\n",
    "word_freq = f.most_common(30)\n",
    "print(word_freq)\n",
    "plot(word_freq)\n",
    "# Disabled variant: plot the logarithm of each frequency instead.\n",
    "# log_word_freq = []\n",
    "# for w, f in word_freq:\n",
    "#     log_word_freq.append((w, np.log(f)))\n",
    "# plot(log_word_freq, ylabel='词频的对数')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/demo_dat_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[江西, 鄱阳湖, 干枯, ，, 中国, 最, 大, 淡水湖, 变成, 大, 草原]\n",
      "[上海市, 虹口区, 大连西路, 5, 5, 0, 号, S, I, S, U]\n",
      "[上海市/ns, 虹口区/ns, 大连西路/ns, 550/m, 号/q, SISU/nx]\n",
      "单词:上海市 词性:ns\n",
      "单词:虹口区 词性:ns\n",
      "单词:大连西路 词性:ns\n",
      "单词:550 词性:m\n",
      "单词:号 词性:q\n",
      "单词:SISU 词性:nx\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-29 09:48\n",
    "# 《自然语言处理入门》2.8 HanLP 的词典分词实现\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from pyhanlp.static import HANLP_DATA_PATH\n",
    "# Path of the mini core dictionary inside HanLP's data directory.\n",
    "dict1 = HANLP_DATA_PATH + \"/dictionary/CoreNatureDictionary.mini.txt\"\n",
    "\n",
    "# Hide part-of-speech tags when terms are printed.\n",
    "HanLP.Config.ShowTermNature = False\n",
    "segment = DoubleArrayTrieSegment(dict1)\n",
    "print(segment.seg('江西鄱阳湖干枯，中国最大淡水湖变成大草原'))\n",
    "\n",
    "# Second dictionary with Shanghai place names; the trailing ' ns' is part of the string handed to HanLP\n",
    "# (presumably the tag applied to its entries - confirm against the HanLP documentation).\n",
    "dict2 = HANLP_DATA_PATH + \"/dictionary/custom/上海地名.txt ns\"\n",
    "segment = DoubleArrayTrieSegment([dict1, dict2])\n",
    "print(segment.seg('上海市虹口区大连西路550号SISU'))\n",
    "\n",
    "# Turn on part-of-speech tagging and show the tags in the printed terms.\n",
    "segment.enablePartOfSpeechTagging(True)\n",
    "HanLP.Config.ShowTermNature = True\n",
    "print(segment.seg('上海市虹口区大连西路550号SISU'))\n",
    "\n",
    "# Each term exposes its text as .word and its tag as .nature.\n",
    "for term in segment.seg('上海市虹口区大连西路550号SISU'):\n",
    "    print(\"单词:%s 词性:%s\" % (term.word, term.nature))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/backward_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['研究', '生命', '起源']\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-22 21:05\n",
    "# 《自然语言处理入门》2.3.3 逆向最长匹配\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch02.utility import load_dictionary\n",
    "\n",
    "\n",
    "def backward_segment(text, dic):\n",
    "    \"\"\"\n",
    "    Segment text by backward longest matching.\n",
    "    :param text: the string to segment\n",
    "    :param dic: container of known words (anything supporting `in`)\n",
    "    :return: list of words in reading order\n",
    "    \"\"\"\n",
    "    segments = []\n",
    "    end = len(text)                              # exclusive end of the part still to be segmented\n",
    "    while end > 0:\n",
    "        match = text[end - 1]                    # fall back to the single character before `end`\n",
    "        for start in range(end - 1):             # leftmost start first, i.e. longest candidate first\n",
    "            candidate = text[start:end]\n",
    "            if candidate in dic:                 # first hit is the longest dictionary word ending here\n",
    "                match = candidate\n",
    "                break\n",
    "        segments.append(match)                   # collected right to left ...\n",
    "        end -= len(match)\n",
    "    segments.reverse()                           # ... so flip into reading order\n",
    "    return segments\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Load the dictionary through the chapter's utility helper, then segment a sample phrase.\n",
    "    dic = load_dictionary()\n",
    "\n",
    "    print(backward_segment('研究生命起源', dic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/speed_benchmark.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "由于JPype调用开销巨大，以下速度显著慢于原生Java\n",
      "59.70 万字/秒\n",
      "62.65 万字/秒\n",
      "28.66 万字/秒\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-25 10:21\n",
    "# 《自然语言处理入门》2.3.5 速度评测\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import time\n",
    "\n",
    "from tests.book.ch02.backward_segment import backward_segment\n",
    "from tests.book.ch02.bidirectional_segment import bidirectional_segment\n",
    "from tests.book.ch02.forward_segment import forward_segment\n",
    "from tests.book.ch02.utility import load_dictionary\n",
    "\n",
    "\n",
    "def evaluate_speed(segment, text, dic, pressure=10000):\n",
    "    \"\"\"\n",
    "    Time a segmentation function and print its throughput in units of 10,000 characters per second.\n",
    "    :param segment: callable invoked as segment(text, dic)\n",
    "    :param text: the text segmented on every repetition\n",
    "    :param dic: the dictionary handed to segment\n",
    "    :param pressure: number of repetitions; passed explicitly so the function no longer\n",
    "                     depends on a module-level variable that only exists when run as a script\n",
    "    \"\"\"\n",
    "    start_time = time.perf_counter()  # monotonic, high-resolution clock meant for benchmarking\n",
    "    for _ in range(pressure):\n",
    "        segment(text, dic)\n",
    "    # Clamp to a tiny positive value so a run shorter than the timer resolution cannot divide by zero.\n",
    "    elapsed_time = max(time.perf_counter() - start_time, 1e-9)\n",
    "    print('%.2f 万字/秒' % (len(text) * pressure / 10000 / elapsed_time))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    text = \"江西鄱阳湖干枯，中国最大淡水湖变成大草原\"\n",
    "    pressure = 10000\n",
    "    dic = load_dictionary()\n",
    "\n",
    "    print('由于JPype调用开销巨大，以下速度显著慢于原生Java')\n",
    "    evaluate_speed(forward_segment, text, dic, pressure)\n",
    "    evaluate_speed(backward_segment, text, dic, pressure)\n",
    "    evaluate_speed(bidirectional_segment, text, dic, pressure)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/aho_corasick.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2:3]=he\n",
      "[1:3]=she\n",
      "[2:5]=hers\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-28 15:22\n",
    "# 《自然语言处理入门》2.6 AC 自动机\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "def classic_demo():\n",
    "    \"\"\"Add four keywords to HanLP's Aho-Corasick trie and print every match found in \"ushers\".\"\"\"\n",
    "    Trie = JClass('com.hankcs.hanlp.algorithm.ahocorasick.trie.Trie')\n",
    "    trie = Trie()\n",
    "    for keyword in (\"hers\", \"his\", \"she\", \"he\"):\n",
    "        trie.addKeyword(keyword)\n",
    "\n",
    "    for hit in trie.parseText(\"ushers\"):\n",
    "        span = (hit.getStart(), hit.getEnd(), hit.getKeyword())\n",
    "        print(\"[%d:%d]=%s\" % span)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Run the demo only when this code is executed directly.\n",
    "    classic_demo()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/fully_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['商', '商品', '品', '和', '和服', '服', '服务', '务']\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-22 21:05\n",
    "# 《自然语言处理入门》2.3.1 完全切分\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch02.utility import load_dictionary\n",
    "\n",
    "\n",
    "def fully_segment(text, dic):\n",
    "    \"\"\"\n",
    "    Enumerate every substring of text that is a dictionary word.\n",
    "    :param text: the string to scan\n",
    "    :param dic: container of known words (anything supporting `in`)\n",
    "    :return: the matching substrings, ordered by start position and then by length\n",
    "    \"\"\"\n",
    "    length = len(text)\n",
    "    return [\n",
    "        text[begin:end]\n",
    "        for begin in range(length)                # every possible first character\n",
    "        for end in range(begin + 1, length + 1)   # every possible exclusive end after it\n",
    "        if text[begin:end] in dic\n",
    "    ]\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Load the dictionary through the chapter's utility helper, then list all dictionary words in a sample phrase.\n",
    "    dic = load_dictionary()\n",
    "\n",
    "    print(fully_segment('商品和服务', dic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/trie.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-25 17:25\n",
    "# 《自然语言处理入门》2.4 字典树\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "class Node(object):\n",
    "    \"\"\"A trie node: a value plus a mapping from the next character to the child node.\"\"\"\n",
    "\n",
    "    def __init__(self, value) -> None:\n",
    "        self._children = {}\n",
    "        self._value = value\n",
    "\n",
    "    def _add_child(self, char, value, overwrite=False):\n",
    "        \"\"\"\n",
    "        Return the child reached by char, creating it with value when it is missing.\n",
    "        An existing child only has its value replaced when overwrite is true.\n",
    "        \"\"\"\n",
    "        existing = self._children.get(char)\n",
    "        if existing is not None:\n",
    "            if overwrite:\n",
    "                existing._value = value\n",
    "            return existing\n",
    "        created = Node(value)\n",
    "        self._children[char] = created\n",
    "        return created\n",
    "\n",
    "\n",
    "class Trie(Node):\n",
    "    \"\"\"Dictionary-like trie; a key whose stored value is None counts as absent.\"\"\"\n",
    "\n",
    "    def __init__(self) -> None:\n",
    "        super().__init__(None)\n",
    "\n",
    "    def __contains__(self, key):\n",
    "        found = self[key]\n",
    "        return found is not None\n",
    "\n",
    "    def __getitem__(self, key):\n",
    "        \"\"\"Walk the key character by character; return the stored value, or None when the path breaks.\"\"\"\n",
    "        node = self\n",
    "        for char in key:\n",
    "            node = node._children.get(char)\n",
    "            if node is None:\n",
    "                return None\n",
    "        return node._value\n",
    "\n",
    "    def __setitem__(self, key, value):\n",
    "        \"\"\"Create the path for key; only the node of the final character receives (or overwrites) the value.\"\"\"\n",
    "        node = self\n",
    "        for position, char in enumerate(key):\n",
    "            is_last = position == len(key) - 1\n",
    "            if is_last:\n",
    "                node = node._add_child(char, value, True)\n",
    "            else:\n",
    "                node = node._add_child(char, None, False)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    trie = Trie()\n",
    "    # Create\n",
    "    trie['自然'] = 'nature'\n",
    "    trie['自然人'] = 'human'\n",
    "    trie['自然语言'] = 'language'\n",
    "    trie['自语'] = 'talk\tto oneself'\n",
    "    trie['入门'] = 'introduction'\n",
    "    assert '自然' in trie\n",
    "    # Delete: storing None makes the key count as absent\n",
    "    trie['自然'] = None\n",
    "    assert '自然' not in trie\n",
    "    # Update\n",
    "    trie['自然语言'] = 'human language'\n",
    "    assert trie['自然语言'] == 'human language'\n",
    "    # Read\n",
    "    assert trie['入门'] == 'introduction'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/forward_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['就读', '北京大学']\n",
      "['研究生', '命', '起源']\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-22 21:05\n",
    "# 《自然语言处理入门》2.3.2 正向最长匹配\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch02.utility import load_dictionary\n",
    "\n",
    "\n",
    "def forward_segment(text, dic):\n",
    "    \"\"\"\n",
    "    Segment text by forward longest matching.\n",
    "    :param text: the string to segment\n",
    "    :param dic: container of known words (anything supporting `in`)\n",
    "    :return: list of words in reading order\n",
    "    \"\"\"\n",
    "    segments = []\n",
    "    total = len(text)\n",
    "    start = 0\n",
    "    while start < total:\n",
    "        # All substrings beginning at `start`, shortest first.\n",
    "        candidates = [text[start:end] for end in range(start + 1, total + 1)]\n",
    "        # Longest dictionary hit wins; with no hit, emit the single character.\n",
    "        best = max((word for word in candidates if word in dic), key=len, default=text[start])\n",
    "        segments.append(best)\n",
    "        start += len(best)                          # continue right after the emitted word\n",
    "    return segments\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Load the dictionary through the chapter's utility helper, then segment two sample phrases.\n",
    "    dic = load_dictionary()\n",
    "\n",
    "    print(forward_segment('就读北京大学', dic))\n",
    "    print(forward_segment('研究生命起源', dic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/demo_acdat_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[江西, 鄱阳湖, 干枯, ，, 中国, 最大, 淡水湖, 变成, 大草原]\n"
     ]
    }
   ],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-05-29 13:51\n",
    "# 《自然语言处理入门》2.8 HanLP 的词典分词实现\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "# Hide part-of-speech tags in the printed terms.\n",
    "HanLP.Config.ShowTermNature = False\n",
    "# Instantiate the segmenter by its Java class name, passing the configured core dictionary path.\n",
    "segment = JClass('com.hankcs.hanlp.seg.Other.AhoCorasickDoubleArrayTrieSegment')(HanLP.Config.CoreDictionaryPath)\n",
    "print(segment.seg(\"江西鄱阳湖干枯，中国最大淡水湖变成大草原\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch02/demo_stopwords.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-04 11:17\n",
    "# 《自然语言处理入门》2.10.1 停用词过滤\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from jpype import JString\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "\n",
    "def load_from_file(path):\n",
    "    \"\"\"\n",
    "    Build a DoubleArrayTrie from a dictionary file with one word per line.\n",
    "    :param path: path of the dictionary file (read as UTF-8)\n",
    "    :return: the double-array trie; every word maps to itself\n",
    "    \"\"\"\n",
    "    entries = JClass('java.util.TreeMap')()  # Java TreeMap handed to the trie constructor\n",
    "    with open(path, encoding='utf-8') as lines:\n",
    "        for line in lines:\n",
    "            entry = line.strip()  # drop the line break and surrounding whitespace\n",
    "            entries[entry] = entry\n",
    "    DoubleArrayTrie = JClass('com.hankcs.hanlp.collection.trie.DoubleArrayTrie')\n",
    "    return DoubleArrayTrie(entries)\n",
    "\n",
    "\n",
    "def load_from_words(*words):\n",
    "    \"\"\"\n",
    "    Build a DoubleArrayTrie from the given words.\n",
    "    :param words: the words to store\n",
    "    :return: the double-array trie; every word maps to itself\n",
    "    \"\"\"\n",
    "    entries = JClass('java.util.TreeMap')()  # Java TreeMap handed to the trie constructor\n",
    "    for entry in words:\n",
    "        entries[entry] = entry\n",
    "    DoubleArrayTrie = JClass('com.hankcs.hanlp.collection.trie.DoubleArrayTrie')\n",
    "    return DoubleArrayTrie(entries)\n",
    "\n",
    "\n",
    "def remove_stopwords_termlist(termlist, trie):\n",
    "    \"\"\"Return the .word of every term whose word is not a key of the stop-word trie.\"\"\"\n",
    "    kept = []\n",
    "    for term in termlist:\n",
    "        if trie.containsKey(term.word):\n",
    "            continue\n",
    "        kept.append(term.word)\n",
    "    return kept\n",
    "\n",
    "\n",
    "def replace_stropwords_text(text, replacement, trie):\n",
    "    \"\"\"\n",
    "    Replace every longest stop-word match in text, without segmenting the text first.\n",
    "    :param text: the text to filter\n",
    "    :param replacement: string substituted for each match\n",
    "    :param trie: trie of stop words offering getLongestSearcher\n",
    "    :return: the text with all matches replaced\n",
    "    \"\"\"\n",
    "    searcher = trie.getLongestSearcher(JString(text), 0)\n",
    "    pieces = []\n",
    "    cursor = 0  # end of the previous match\n",
    "    while searcher.next():\n",
    "        hit_start = searcher.begin\n",
    "        if hit_start > cursor:\n",
    "            pieces.append(text[cursor:hit_start])  # untouched text between two matches\n",
    "        pieces.append(replacement)\n",
    "        cursor = hit_start + searcher.length\n",
    "    if cursor < len(text):\n",
    "        pieces.append(text[cursor:])  # tail after the last match\n",
    "    return ''.join(pieces)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    HanLP.Config.ShowTermNature = False\n",
    "    # Variant 1: segment first, then drop the terms found in the core stop-word dictionary.\n",
    "    trie = load_from_file(HanLP.Config.CoreStopWordDictionaryPath)\n",
    "    text = \"停用词的意义相对而言无关紧要吧。\"\n",
    "    segment = DoubleArrayTrieSegment(HanLP.Config.CoreDictionaryPath)\n",
    "    termlist = segment.seg(text)\n",
    "    print(\"分词结果：\", termlist)\n",
    "    print(\"分词结果去除停用词：\", remove_stopwords_termlist(termlist, trie))\n",
    "    # Variant 2: no segmentation; replace three hand-picked stop words directly in the raw text.\n",
    "    trie = load_from_words(\"的\", \"相对而言\", \"吧\")\n",
    "    print(\"不分词去掉停用词\", replace_stropwords_text(text, \"**\", trie))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## 第03章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/ngram_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-06 13:19\n",
    "# 《自然语言处理入门》3.3 训练\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from jpype import JString\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.demo_corpus_loader import my_cws_corpus\n",
    "from tests.book.ch03.msr import msr_model\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "# Python handles for the HanLP Java classes used by the functions below.\n",
    "# NOTE(review): CoreDictionary is bound with LazyLoadingJClass, presumably so that it is loaded only after\n",
    "# load_bigram() has changed the dictionary paths - confirm against pyhanlp.\n",
    "NatureDictionaryMaker = SafeJClass('com.hankcs.hanlp.corpus.dictionary.NatureDictionaryMaker')\n",
    "CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')\n",
    "WordNet = JClass('com.hankcs.hanlp.seg.common.WordNet')\n",
    "Vertex = JClass('com.hankcs.hanlp.seg.common.Vertex')\n",
    "ViterbiSegment = JClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')\n",
    "DijkstraSegment = JClass('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment')\n",
    "CoreDictionary = LazyLoadingJClass('com.hankcs.hanlp.dictionary.CoreDictionary')\n",
    "Nature = JClass('com.hankcs.hanlp.corpus.tag.Nature')\n",
    "\n",
    "\n",
    "def train_bigram(corpus_path, model_path):\n",
    "    \"\"\"\n",
    "    Train a model from a segmented corpus and save it as text files.\n",
    "    :param corpus_path: path of the corpus passed to CorpusLoader.convert2SentenceList\n",
    "    :param model_path: path passed to NatureDictionaryMaker.saveTxtTo, e.g. tests/data/my_cws_model.txt\n",
    "    \"\"\"\n",
    "    sentences = CorpusLoader.convert2SentenceList(corpus_path)\n",
    "    for sentence in sentences:\n",
    "        for word in sentence:\n",
    "            if word.label is not None:\n",
    "                continue\n",
    "            word.setLabel(\"n\")  # words without a label get the label \"n\"\n",
    "    maker = NatureDictionaryMaker()\n",
    "    maker.compute(sentences)\n",
    "    maker.saveTxtTo(model_path)\n",
    "\n",
    "\n",
    "def load_bigram(model_path, verbose=True, ret_viterbi=True):\n",
    "    \"\"\"\n",
    "    Point HanLP at a trained model and return a segmenter.\n",
    "    :param model_path: model path without extension; '.txt', '.ngram.txt' and '.tr.txt' are appended\n",
    "    :param verbose: when true, print sample frequencies, a word lattice and its viterbi path\n",
    "    :param ret_viterbi: return a ViterbiSegment when true, otherwise a DijkstraSegment\n",
    "    :return: the segmenter, with named entity recognition and the custom dictionary disabled\n",
    "    \"\"\"\n",
    "    HanLP.Config.CoreDictionaryPath = model_path + \".txt\"  # unigram\n",
    "    HanLP.Config.BiGramDictionaryPath = model_path + \".ngram.txt\"  # bigram\n",
    "    # The following section is for compatibility with new tag sets; skip it if not interested\n",
    "    HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = model_path + \".tr.txt\"  # part-of-speech transition matrix, can be ignored for segmentation\n",
    "    if model_path != msr_model:\n",
    "        # Register every tag listed after the first column of the file's first line.\n",
    "        with open(HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath, encoding='utf-8') as src:\n",
    "            for tag in src.readline().strip().split(',')[1:]:\n",
    "                Nature.create(tag)\n",
    "    CoreBiGramTableDictionary = SafeJClass('com.hankcs.hanlp.dictionary.CoreBiGramTableDictionary')\n",
    "    # Result is discarded; NOTE(review): presumably called to trigger loading of the lazily bound dictionary - confirm.\n",
    "    CoreDictionary.getTermFrequency(\"商品\")\n",
    "    # End of compatibility code\n",
    "    if verbose:\n",
    "        print(CoreDictionary.getTermFrequency(\"商品\"))\n",
    "        print(CoreBiGramTableDictionary.getBiFrequency(\"商品\", \"和\"))\n",
    "        sent = '商品和服务'\n",
    "        # sent = '货币和服务'\n",
    "        wordnet = generate_wordnet(sent, CoreDictionary.trie)\n",
    "        print(wordnet)\n",
    "        print(viterbi(wordnet))\n",
    "    return ViterbiSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(\n",
    "        False) if ret_viterbi else DijkstraSegment().enableAllNamedEntityRecognize(False).enableCustomDictionary(False)\n",
    "\n",
    "\n",
    "def generate_wordnet(sent, trie):\n",
    "    \"\"\"\n",
    "    Build the word lattice of a sentence.\n",
    "    :param sent: the sentence\n",
    "    :param trie: dictionary (unigram) trie offering getSearcher\n",
    "    :return: the word lattice\n",
    "    \"\"\"\n",
    "    searcher = trie.getSearcher(JString(sent), 0)\n",
    "    wordnet = WordNet(sent)\n",
    "    while searcher.next():\n",
    "        # Rows are offset by one relative to character positions (a match at begin goes to row begin + 1).\n",
    "        wordnet.add(searcher.begin + 1,\n",
    "                    Vertex(sent[searcher.begin:searcher.begin + searcher.length], searcher.value, searcher.index))\n",
    "    # Atomic segmentation: fill the rows that received no word so the graph stays connected\n",
    "    vertexes = wordnet.getVertexes()\n",
    "    i = 0\n",
    "    while i < len(vertexes):\n",
    "        if len(vertexes[i]) == 0:  # empty row\n",
    "            for j in range(i + 1, len(vertexes) - 1):  # look for the first non-empty row j\n",
    "                if len(vertexes[j]):\n",
    "                    break\n",
    "            else:\n",
    "                # Every scanned row was empty (or there was none to scan). A Python for loop leaves j on the\n",
    "                # last scanned row rather than one past it, which used to cut a trailing unmatched run into\n",
    "                # two vertices; point j at the final row so the whole run is filled at once.\n",
    "                j = len(vertexes) - 1\n",
    "            wordnet.add(i, Vertex.newPunctuationInstance(sent[i - 1: j - 1]))  # fill the empty rows in [i, j)\n",
    "            i = j\n",
    "        else:\n",
    "            i += len(vertexes[i][-1].realWord)\n",
    "\n",
    "    return wordnet\n",
    "\n",
    "\n",
    "def viterbi(wordnet):\n",
    "    \"\"\"\n",
    "    Find the best path through a word lattice.\n",
    "    :param wordnet: lattice offering getVertexes(), a sequence of rows of vertices\n",
    "    :return: the realWord of every vertex on the path, from the first row to the last\n",
    "    \"\"\"\n",
    "    rows = wordnet.getVertexes()\n",
    "    last_row = len(rows) - 1\n",
    "    # Forward pass: offer each vertex as predecessor to the vertices in the row where its word ends\n",
    "    for row_index in range(last_row):\n",
    "        for vertex in rows[row_index]:\n",
    "            for successor in rows[row_index + len(vertex.realWord)]:\n",
    "                successor.updateFrom(vertex)  # the vertex itself decides whether this predecessor is better\n",
    "    # Backward pass: follow the predecessor links from the first vertex of the last row\n",
    "    reversed_path = []\n",
    "    vertex = rows[last_row].getFirst()\n",
    "    while vertex:\n",
    "        reversed_path.append(vertex)\n",
    "        vertex = vertex.getFrom()\n",
    "    return [v.realWord for v in reversed(reversed_path)]\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train on the small demo corpus, store the model under the test data directory, then load it verbosely.\n",
    "    corpus_path = my_cws_corpus()\n",
    "    model_path = os.path.join(test_data_path(), 'my_cws_model')\n",
    "    train_bigram(corpus_path, model_path)\n",
    "    load_bigram(model_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/japanese_segment.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-07 18:37\n",
    "# 《自然语言处理入门》3.6.1 日语分词语料\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import os\n",
    "\n",
    "from tests.book.ch03.ngram_segment import train_bigram, load_bigram\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "# ensure_data's result is used as a directory below; the name jp_corpus is then rebound to the training file inside it.\n",
    "jp_corpus = ensure_data('jpcorpus',\n",
    "                        'http://file.hankcs.com/corpus/jpcorpus.zip')\n",
    "jp_bigram = os.path.join(jp_corpus, 'jp_bigram')\n",
    "jp_corpus = os.path.join(jp_corpus, 'ja_gsd-ud-train.txt')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    train_bigram(jp_corpus, jp_bigram)  # train\n",
    "    segment = load_bigram(jp_bigram, verbose=False)  # load\n",
    "    print(segment.seg('自然言語処理入門という本が面白いぞ！'))  # segment Japanese text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/adjust_model.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-08 15:34\n",
    "# 《自然语言处理入门》3.5.3 调整模型\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import HanLP\n",
    "from tests.book.ch03.msr import msr_model\n",
    "from tests.book.ch03.ngram_segment import load_bigram, CoreDictionary\n",
    "\n",
    "# Load the MSR bigram model and check that the dictionary contains the word under study.\n",
    "segment = load_bigram(model_path=msr_model, verbose=False, ret_viterbi=False)\n",
    "assert CoreDictionary.contains(\"管道\")\n",
    "text = \"北京输气管道工程\"\n",
    "# NOTE(review): enableDebug() is never reverted in this cell, so debug output presumably\n",
    "# stays on for cells executed afterwards — confirm whether that is intended.\n",
    "HanLP.Config.enableDebug()\n",
    "print(segment.seg(text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/demo_custom_dict.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-07 14:48\n",
    "# 《自然语言处理入门》3.4.5 与用户词典的集成\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "ViterbiSegment = SafeJClass('com.hankcs.hanlp.seg.Viterbi.ViterbiSegment')\n",
    "\n",
    "# The three prints below segment the same sentence under three dictionary settings;\n",
    "# the statements are order-dependent because each one changes the segmenter state.\n",
    "segment = ViterbiSegment()\n",
    "sentence = \"社会摇摆简称社会摇\"\n",
    "# 1) Custom dictionary switched off.\n",
    "segment.enableCustomDictionary(False)\n",
    "print(\"不挂载词典：\", segment.seg(sentence))\n",
    "# 2) Insert a user word, then switch the custom dictionary on.\n",
    "CustomDictionary.insert(\"社会摇\", \"nz 100\")\n",
    "segment.enableCustomDictionary(True)\n",
    "print(\"低优先级词典：\", segment.seg(sentence))\n",
    "# 3) Additionally switch on forcing mode.\n",
    "segment.enableCustomDictionaryForcing(True)\n",
    "print(\"高优先级词典：\", segment.seg(sentence))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/eval_bigram_cws.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-07 15:25\n",
    "# 《自然语言处理入门》3.5.1 标准化评测\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold\n",
    "from tests.book.ch03.ngram_segment import train_bigram, load_bigram\n",
    "\n",
    "CWSEvaluator = SafeJClass('com.hankcs.hanlp.seg.common.CWSEvaluator')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    train_bigram(msr_train, msr_model)  # train\n",
    "    segment = load_bigram(msr_model)  # load\n",
    "    result = CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict)  # predict and score\n",
    "    print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/sighan05_statistics.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-05 18:03\n",
    "# 《自然语言处理入门》3.2.4 语料库统计\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "import os\n",
    "\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "\n",
    "def count_corpus(train_path: str, test_path: str):\n",
    "    \"\"\"Compute one row of corpus statistics from a training file and a test file.\n",
    "\n",
    "    :param train_path: path of the segmented training corpus\n",
    "    :param test_path: path of the segmented test (gold) corpus\n",
    "    :return: 9-tuple. For the training split and then the test split: characters / 10000,\n",
    "        distinct words / 10000, total word frequency / 10000 and average word length;\n",
    "        the last item is the test out-of-vocabulary rate in percent.\n",
    "    \"\"\"\n",
    "    train_counter, train_freq, train_chars = count_word_freq(train_path)\n",
    "    test_counter, test_freq, test_chars = count_word_freq(test_path)\n",
    "    # OOV mass: occurrences of test words that never appear in the training split.\n",
    "    unseen_words = test_counter.keys() - train_counter.keys()\n",
    "    test_oov = sum(test_counter[w] for w in unseen_words)\n",
    "    train_row = (train_chars / 10000, len(train_counter) / 10000, train_freq / 10000, train_chars / train_freq)\n",
    "    test_row = (test_chars / 10000, len(test_counter) / 10000, test_freq / 10000, test_chars / test_freq)\n",
    "    return train_row + test_row + (test_oov / test_freq * 100,)\n",
    "\n",
    "\n",
    "def count_word_freq(train_path):\n",
    "    \"\"\"Count word frequencies in a whitespace-segmented UTF-8 corpus file.\n",
    "\n",
    "    :param train_path: path of the corpus file; words are separated by whitespace\n",
    "    :return: ``(counter, total_frequency, total_characters)`` where ``counter`` maps each\n",
    "        word to its number of occurrences\n",
    "    \"\"\"\n",
    "    f = Counter()\n",
    "    with open(train_path, encoding='utf-8') as src:\n",
    "        for line in src:\n",
    "            # str.split() without arguments splits on runs of whitespace and yields nothing\n",
    "            # for a blank line; the former regex split returned [''] there, which counted\n",
    "            # an empty word and inflated both the vocabulary size and the total frequency.\n",
    "            f.update(line.split())\n",
    "    return f, sum(f.values()), sum(len(w) * n for w, n in f.items())\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')\n",
    "    # Table header; the columns after the corpus name follow the order returned by count_corpus.\n",
    "    print('|语料库|字符数|词语种数|总词频|平均词长|字符数|词语种数|总词频|平均词长|OOV|')\n",
    "    for data in 'pku', 'msr', 'as', 'cityu':\n",
    "        train_path = os.path.join(sighan05, 'training', '{}_training.utf8'.format(data))\n",
    "        # The gold file of the 'as' corpus follows a different name pattern from the other three.\n",
    "        test_path = os.path.join(sighan05, 'gold',\n",
    "                                 ('{}_testing_gold.utf8' if data == 'as' else '{}_test_gold.utf8').format(data))\n",
    "        print(\n",
    "            '|%s|%.0f万|%.0f万|%.0f万|%.1f|%.0f万|%.0f万|%.0f万|%.1f|%.2f%%|' % (\n",
    "                (data.upper(),) + count_corpus(train_path, test_path)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch03/demo_corpus_loader.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-06 12:51\n",
    "# 《自然语言处理入门》3.3.1 加载语料库\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "CorpusLoader = SafeJClass('com.hankcs.hanlp.corpus.document.CorpusLoader')\n",
    "\n",
    "\n",
    "def my_cws_corpus():\n",
    "    \"\"\"Return the path of the three-sentence sample corpus, writing the file on first use.\"\"\"\n",
    "    corpus_path = os.path.join(test_data_path(), 'my_cws_corpus.txt')\n",
    "    if os.path.isfile(corpus_path):\n",
    "        return corpus_path  # already written by an earlier call\n",
    "    sample = '''商品 和 服务\n",
    "商品 和服 物美价廉\n",
    "服务 和 货币'''\n",
    "    with open(corpus_path, 'w', encoding='utf-8') as out:\n",
    "        out.write(sample)\n",
    "    return corpus_path\n",
    "\n",
    "\n",
    "def load_cws_corpus(corpus_path):\n",
    "    \"\"\"Load the corpus at ``corpus_path`` via ``CorpusLoader.convert2SentenceList``.\n",
    "\n",
    "    :param corpus_path: path of the segmented corpus file\n",
    "    :return: whatever ``convert2SentenceList`` returns (iterated sentence by sentence by the demo below)\n",
    "    \"\"\"\n",
    "    return CorpusLoader.convert2SentenceList(corpus_path)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Demo: create (or reuse) the sample corpus, load it, and print every sentence.\n",
    "    corpus_path = my_cws_corpus()\n",
    "    sents = load_cws_corpus(corpus_path)\n",
    "    for sent in sents:\n",
    "        print(sent)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第04章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch04/doctor_hmm.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-17 11:32\n",
    "# 《自然语言处理入门》4.4 隐马尔可夫模型的训练\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import numpy as np\n",
    "from pyhanlp import *\n",
    "from jpype import JArray, JFloat, JInt\n",
    "\n",
    "to_str = JClass('java.util.Arrays').toString\n",
    "\n",
    "# Hand-specified HMM of the doctor example: hidden states, start distribution,\n",
    "# state-to-state transition table and state-to-observation emission table.\n",
    "states = ('Healthy', 'Fever')\n",
    "start_probability = {'Healthy': 0.6, 'Fever': 0.4}\n",
    "transition_probability = {\n",
    "    'Healthy': {'Healthy': 0.7, 'Fever': 0.3},\n",
    "    'Fever': {'Healthy': 0.4, 'Fever': 0.6},\n",
    "}\n",
    "emission_probability = {\n",
    "    'Healthy': {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},\n",
    "    'Fever': {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6},\n",
    "}\n",
    "# Observation sequence decoded at the end of this script.\n",
    "observations = ('normal', 'cold', 'dizzy')\n",
    "\n",
    "\n",
    "def generate_index_map(lables):\n",
    "    \"\"\"Number the labels 0..n-1 in iteration order and build both lookup directions.\n",
    "\n",
    "    :param lables: iterable of labels\n",
    "    :return: ``(label_index, index_label)``: label -> index and index -> label dicts\n",
    "    \"\"\"\n",
    "    index_label = dict(enumerate(lables))\n",
    "    # Built from index_label in order, so a repeated label keeps its last index.\n",
    "    label_index = {label: idx for idx, label in index_label.items()}\n",
    "    return label_index, index_label\n",
    "\n",
    "\n",
    "# Label <-> integer index lookups for the hidden states and for the observations.\n",
    "states_label_index, states_index_label = generate_index_map(states)\n",
    "observations_label_index, observations_index_label = generate_index_map(observations)\n",
    "\n",
    "\n",
    "def convert_observations_to_index(observations, label_index):\n",
    "    \"\"\"Translate a sequence of observation labels into their integer indexes.\n",
    "\n",
    "    :param observations: iterable of labels\n",
    "    :param label_index: label -> index mapping; a missing label raises ``KeyError``\n",
    "    :return: list of indexes, in the order of ``observations``\n",
    "    \"\"\"\n",
    "    return [label_index[label] for label in observations]\n",
    "\n",
    "\n",
    "def convert_map_to_vector(map, label_index):\n",
    "    \"\"\"Lay out a label -> value dict as a Java float array ordered by ``label_index``.\n",
    "\n",
    "    :param map: dict from label to value\n",
    "    :param label_index: dict from label to position in the vector\n",
    "    :return: one-dimensional Java float array\n",
    "    \"\"\"\n",
    "    v = np.empty(len(map), dtype=float)\n",
    "    for label, value in map.items():\n",
    "        v[label_index[label]] = value\n",
    "    return JArray(JFloat, v.ndim)(v.tolist())  # convert the numpy array into a Java array\n",
    "\n",
    "\n",
    "def convert_map_to_matrix(map, label_index1, label_index2):\n",
    "    \"\"\"Lay out a nested label -> label -> value dict as a Java float matrix.\n",
    "\n",
    "    :param map: dict of dicts; outer keys select the row, inner keys the column\n",
    "    :param label_index1: dict from row label to row number\n",
    "    :param label_index2: dict from column label to column number\n",
    "    :return: two-dimensional Java float array\n",
    "    \"\"\"\n",
    "    m = np.empty((len(label_index1), len(label_index2)), dtype=float)\n",
    "    for row_label, row in map.items():\n",
    "        for col_label, value in row.items():\n",
    "            m[label_index1[row_label], label_index2[col_label]] = value\n",
    "    return JArray(JFloat, m.ndim)(m.tolist())\n",
    "\n",
    "\n",
    "import math\n",
    "\n",
    "# Convert the probability tables above into the Java arrays the HMM constructor takes.\n",
    "A = convert_map_to_matrix(transition_probability, states_label_index, states_label_index)\n",
    "B = convert_map_to_matrix(emission_probability, states_label_index, observations_label_index)\n",
    "observations_index = convert_observations_to_index(observations, observations_label_index)\n",
    "pi = convert_map_to_vector(start_probability, states_label_index)\n",
    "\n",
    "FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')\n",
    "given_model = FirstOrderHiddenMarkovModel(pi, A, B)\n",
    "\n",
    "# Print a few sampled (observation, state) sequences as observation/state pairs.\n",
    "for O, S in given_model.generate(3, 5, 2):\n",
    "    print(\" \".join((observations_index_label[o] + '/' + states_index_label[s]) for o, s in zip(O, S)))\n",
    "\n",
    "# Train a fresh model on samples drawn from the given one and check that it comes out similar.\n",
    "trained_model = FirstOrderHiddenMarkovModel()\n",
    "trained_model.train(given_model.generate(3, 10, 100000))\n",
    "assert trained_model.similar(given_model)\n",
    "trained_model.unLog()  # turn the log-probabilities back into probabilities\n",
    "\n",
    "print(trained_model.start_probability)\n",
    "for vec in trained_model.transition_probability:\n",
    "    print(vec)\n",
    "for vec in trained_model.emission_probability:\n",
    "    print(vec)\n",
    "\n",
    "# Decode the observation sequence; predict() fills pred in place and returns a log score.\n",
    "pred = JArray(JInt, 1)([0, 0, 0])\n",
    "prob = given_model.predict(observations_index, pred)\n",
    "# np.math was only an alias of the stdlib math module and no longer exists in NumPy 2.0,\n",
    "# so call math.exp directly.\n",
    "print(\" \".join((observations_index_label[o] + '/' + states_index_label[s]) for o, s in\n",
    "               zip(observations_index, pred)) + \" {:.3f}\".format(math.exp(prob)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch04/hmm_cws.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-19 14:33\n",
    "# 《自然语言处理入门》4.6 隐马尔可夫模型应用于中文分词\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.eval_bigram_cws import CWSEvaluator\n",
    "from tests.book.ch03.msr import msr_dict, msr_train, msr_model, msr_test, msr_output, msr_gold\n",
    "\n",
    "FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')\n",
    "SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')\n",
    "HMMSegmenter = JClass('com.hankcs.hanlp.model.hmm.HMMSegmenter')\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train an HMM segmenter on ``corpus`` and return it converted with ``toSegment()``.\n",
    "\n",
    "    :param corpus: training corpus, passed straight to ``HMMSegmenter.train``\n",
    "    :param model: HMM instance wrapped by the segmenter\n",
    "    :return: the result of ``segmenter.toSegment()``\n",
    "    \"\"\"\n",
    "    segmenter = HMMSegmenter(model)\n",
    "    segmenter.train(corpus)\n",
    "    print(segmenter.segment('商品和服务'))  # quick look at the output on a sample phrase\n",
    "    return segmenter.toSegment()\n",
    "\n",
    "\n",
    "def evaluate(segment):\n",
    "    \"\"\"Score ``segment`` with ``CWSEvaluator`` on the MSR test files and print the result.\"\"\"\n",
    "    result = CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict)\n",
    "    print(result)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train and score a first-order HMM, then repeat with a second-order HMM.\n",
    "    segment = train(msr_train, FirstOrderHiddenMarkovModel())\n",
    "    evaluate(segment)\n",
    "    segment = train(msr_train, SecondOrderHiddenMarkovModel())\n",
    "    evaluate(segment)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第05章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/plot_2d_sgd.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-26 16:49\n",
    "# 《自然语言处理入门》5.2.4 损失函数与随机梯度下降\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import numpy as np\n",
    "from tests.book.ch05.plot_name import newline\n",
    "import matplotlib\n",
    "from matplotlib import pyplot as plt\n",
    "# Font used for every Chinese label in this figure.\n",
    "fontpath = ensure_data('SimHei.ttf', 'https://github.com/StellarCN/scp_zh/raw/master/fonts/SimHei.ttf')\n",
    "myfont = matplotlib.font_manager.FontProperties(fname=fontpath)\n",
    "plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly\n",
    "\n",
    "x = np.linspace(-1.5, 1.5)\n",
    "y = x ** 2\n",
    "\n",
    "# Green vertical line at w = 1, the point where the gradient is annotated.\n",
    "newline([1, 0], [1, 1], color='g')\n",
    "\n",
    "plt.ylim([0, 2])\n",
    "\n",
    "plt.plot(x, y)\n",
    "plt.title('$J(w)=w^2$')\n",
    "plt.xlabel('$w$')\n",
    "plt.ylabel('$J(w)$')\n",
    "# Raw string: in a normal literal the backslash of \\Delta is an invalid escape sequence,\n",
    "# which newer Python versions report with a warning.\n",
    "plt.annotate(r'梯度$\\Delta w = 2$', xy=(1, 1), xytext=(0, 1.0), ha='center',\n",
    "             arrowprops=dict(facecolor='black', shrink=0.05),fontproperties=myfont\n",
    "             )\n",
    "\n",
    "# Two arrow-shaped text boxes: left arrow for descent, right arrow for ascent.\n",
    "bbox_props = dict(boxstyle=\"larrow\", fc='w', ec=\"black\", lw=2)\n",
    "t = plt.text(0.6, 0.1, \"梯度下降方向\", ha=\"center\", va=\"center\", rotation=0,\n",
    "             bbox=bbox_props,fontproperties=myfont)\n",
    "bbox_props['boxstyle'] = 'rarrow'\n",
    "plt.text(1.4, 0.1, \"梯度上升方向\", ha=\"center\", va=\"center\", rotation=0,\n",
    "         bbox=bbox_props,fontproperties=myfont)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/classify_name.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-21 19:46\n",
    "# 《自然语言处理入门》5.3 基于感知机的人名性别分类\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "PerceptronNameGenderClassifier = JClass('com.hankcs.hanlp.model.perceptron.PerceptronNameGenderClassifier')\n",
    "# Name corpus location, the train/test CSV files inside it, and the model path next to it.\n",
    "cnname = ensure_data('cnname', 'http://file.hankcs.com/corpus/cnname.zip')\n",
    "TRAINING_SET = os.path.join(cnname, 'train.csv')\n",
    "TESTING_SET = os.path.join(cnname, 'test.csv')\n",
    "MODEL = cnname + \".bin\"\n",
    "\n",
    "\n",
    "def run_classifier(averaged_perceptron):\n",
    "    \"\"\"Train, probe and evaluate a name-gender classifier, printing every result.\n",
    "\n",
    "    :param averaged_perceptron: forwarded to ``classifier.train``; also selects the banner text\n",
    "    \"\"\"\n",
    "    algorithm = '平均感知机算法' if averaged_perceptron else '朴素感知机算法'\n",
    "    print('=====%s=====' % algorithm)\n",
    "    classifier = PerceptronNameGenderClassifier()\n",
    "    train_accuracy = classifier.train(TRAINING_SET, 10, averaged_perceptron)\n",
    "    print('训练集准确率：', train_accuracy)\n",
    "    model = classifier.getModel()\n",
    "    feature_count = len(model.parameter)\n",
    "    print('特征数量：', feature_count)\n",
    "    # Kept from the original for reference (saving the model and reloading it from disk):\n",
    "    # model.save(MODEL, model.featureMap.entrySet(), 0, True)\n",
    "    # classifier = PerceptronNameGenderClassifier(MODEL)\n",
    "    for name in (\"赵建军\", \"沈雁冰\", \"陆雪琪\", \"李冰冰\"):\n",
    "        print('%s=%s' % (name, classifier.predict(name)))\n",
    "    test_accuracy = classifier.evaluate(TESTING_SET)\n",
    "    print('测试集准确率：', test_accuracy)\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Run the experiment twice: first with averaged_perceptron off, then with it on.\n",
    "    run_classifier(False)\n",
    "    run_classifier(True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/plot_3d_sgd.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：https://am207.github.io/2017/wiki/gradientdescent.html\n",
    "# Date: 2018-06-26 17:15\n",
    "# 《自然语言处理入门》5.2.4 损失函数与随机梯度下降\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from sklearn.datasets import make_regression\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "\n",
    "\n",
    "def gradient_descent(x, y, theta_init, step=0.001, maxsteps=0, precision=0.001):\n",
    "    \"\"\"Batch gradient descent on the halved mean squared error of a linear model.\n",
    "\n",
    "    :param x: design matrix; ``np.dot(x, theta)`` gives the predictions\n",
    "    :param y: target vector\n",
    "    :param theta_init: starting parameter vector\n",
    "    :param step: learning rate\n",
    "    :param maxsteps: stop once the step counter equals this value; 0 means no limit\n",
    "    :param precision: stop once the cost changes by no more than this between two steps\n",
    "    :return: ``(history, costs, preds, counter)``: every theta visited, the cost at each of\n",
    "        them, the initial prediction plus the one of every 25th step, and the step counter\n",
    "    \"\"\"\n",
    "    n_points = y.size  # number of data points\n",
    "\n",
    "    def evaluate(weights):\n",
    "        # prediction, residual and cost for one parameter vector\n",
    "        prediction = np.dot(x, weights)\n",
    "        residual = prediction - y\n",
    "        return prediction, residual, np.sum(residual ** 2) / (2 * n_points)\n",
    "\n",
    "    theta = theta_init\n",
    "    pred, residual, currentcost = evaluate(theta)\n",
    "    history, costs, preds = [theta], [currentcost], [pred]\n",
    "    counter = 1\n",
    "    oldcost = 0\n",
    "    while abs(currentcost - oldcost) > precision:\n",
    "        oldcost = currentcost\n",
    "        theta = theta - step * (x.T.dot(residual) / n_points)  # gradient step\n",
    "        history.append(theta)\n",
    "\n",
    "        pred, residual, currentcost = evaluate(theta)\n",
    "        costs.append(currentcost)\n",
    "\n",
    "        if counter % 25 == 0:\n",
    "            preds.append(pred)  # keep a snapshot of every 25th prediction\n",
    "        counter += 1\n",
    "        if maxsteps and counter == maxsteps:\n",
    "            break\n",
    "\n",
    "    return history, costs, preds, counter\n",
    "\n",
    "\n",
    "# Synthetic one-feature regression problem (seeded, so the data is reproducible).\n",
    "x, y = make_regression(n_samples=100,\n",
    "                       n_features=1,\n",
    "                       n_informative=1,\n",
    "                       noise=20,\n",
    "                       random_state=2017)\n",
    "x = x.flatten()\n",
    "\n",
    "# Prepend a column of ones so the first weight multiplies a constant term.\n",
    "xaug = np.c_[np.ones(x.shape[0]), x]\n",
    "# Jitter the start point with a seeded generator: the data above is seeded, so an unseeded\n",
    "# start would make the plotted descent path differ on every run of the notebook.\n",
    "theta_i = [-15, 40] + np.random.RandomState(2017).rand(2)\n",
    "history, cost, preds, iters = gradient_descent(xaug, y, theta_i, step=0.1)\n",
    "theta = history[-1]\n",
    "\n",
    "\n",
    "def error(X, Y, THETA):\n",
    "    \"\"\"Return the halved mean squared error of the linear model ``X.dot(THETA)`` against ``Y``.\"\"\"\n",
    "    residual = X.dot(THETA) - Y\n",
    "    return np.sum(residual ** 2) / (2 * Y.size)\n",
    "\n",
    "\n",
    "# Parameter grid centred on the final theta: +/-20 around theta[0], +/-40 around theta[1].\n",
    "ms = np.linspace(theta[0] - 20, theta[0] + 20, 20)\n",
    "bs = np.linspace(theta[1] - 40, theta[1] + 40, 40)\n",
    "\n",
    "M, B = np.meshgrid(ms, bs)\n",
    "\n",
    "# Cost at every grid point, reshaped back onto the grid.\n",
    "zs = np.array([error(xaug, y, theta)\n",
    "               for theta in zip(np.ravel(M), np.ravel(B))])\n",
    "Z = zs.reshape(M.shape)\n",
    "\n",
    "fig = plt.figure(figsize=(10, 6))\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "\n",
    "# Translucent cost surface plus its contour lines drawn at z = 0.\n",
    "ax.plot_surface(M, B, Z, rstride=1, cstride=1, color='b', alpha=0.2)\n",
    "ax.contour(M, B, Z, 20, alpha=0.5, offset=0, stride=30)\n",
    "\n",
    "ax.set_xlabel('$w_1$')\n",
    "ax.set_ylabel('$w_2$')\n",
    "ax.set_zlabel('$J(w_1,w_2)$')\n",
    "ax.view_init(elev=30., azim=30)\n",
    "# End point ('>') and start point ('8') of the descent, drawn on the surface.\n",
    "ax.plot([theta[0]], [theta[1]], [cost[-1]], markerfacecolor='r', markeredgecolor='r', marker='>', markersize=7)\n",
    "ax.plot([history[0][0]], [history[0][1]], [cost[0]], markerfacecolor='r', markeredgecolor='r', marker='8', markersize=7)\n",
    "\n",
    "# Start point again at z = 0, then the whole path on the surface and at z = 0,\n",
    "# and finally the end point at z = 0.\n",
    "ax.plot([history[0][0]], [history[0][1]], 0, markerfacecolor='r', markeredgecolor='r', marker='8',\n",
    "        markersize=7)\n",
    "ax.plot([t[0] for t in history], [t[1] for t in history], cost, markerfacecolor='r', markeredgecolor='r', marker='.',\n",
    "        markersize=2)\n",
    "ax.plot([t[0] for t in history], [t[1] for t in history], 0, markerfacecolor='r', markeredgecolor='r', marker='.',\n",
    "        markersize=2)\n",
    "ax.plot([history[-1][0]], [history[-1][1]], 0, markerfacecolor='r', markeredgecolor='r', marker='>',\n",
    "        markersize=7)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/plot_name.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-20 10:02\n",
    "# 《自然语言处理入门》5.2 线性分类模型与感知机算法\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.lines as mlines\n",
    "import matplotlib\n",
    "# Font used for the Chinese labels below.\n",
    "# NOTE(review): ensure_data is not imported in this cell; presumably it comes from the\n",
    "# setup cell at the top of the notebook — confirm.\n",
    "fontpath = ensure_data('SimHei.ttf', 'https://github.com/StellarCN/scp_zh/raw/master/fonts/SimHei.ttf')\n",
    "myfont = matplotlib.font_manager.FontProperties(fname=fontpath)\n",
    "plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly\n",
    "\n",
    "\n",
    "def newline(p1, p2, color=None, marker=None):\n",
    "    \"\"\"\n",
    "    Draw the line through two points across the current axes and return it.\n",
    "\n",
    "    Adapted from https://stackoverflow.com/questions/36470343/how-to-draw-a-line-with-matplotlib\n",
    "    :param p1: first point as (x, y)\n",
    "    :param p2: second point as (x, y)\n",
    "    :param color: forwarded to ``Line2D``\n",
    "    :param marker: forwarded to ``Line2D``\n",
    "    :return: the ``Line2D`` that was added to the current axes\n",
    "    \"\"\"\n",
    "    ax = plt.gca()\n",
    "    x_lo, x_hi = ax.get_xbound()\n",
    "\n",
    "    if p2[0] != p1[0]:\n",
    "        # sloped line: evaluate it at both x bounds of the axes\n",
    "        slope = (p2[1] - p1[1]) / (p2[0] - p1[0])\n",
    "        y_hi = p1[1] + slope * (x_hi - p1[0])\n",
    "        y_lo = p1[1] + slope * (x_lo - p1[0])\n",
    "    else:\n",
    "        # vertical line: pin x and span the current y bounds\n",
    "        x_lo = x_hi = p1[0]\n",
    "        y_lo, y_hi = ax.get_ybound()\n",
    "\n",
    "    line = mlines.Line2D([x_lo, x_hi], [y_lo, y_hi], color=color, marker=marker)\n",
    "    ax.add_line(line)\n",
    "    return line\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Feature vectors of the two sample names: (contains the first character, contains the second).\n",
    "    male = [1, 1]\n",
    "    female = [0, 1]\n",
    "\n",
    "    fig, ax = plt.subplots()\n",
    "    m = ax.scatter(male[0], male[1], s=60, c='blue', marker='x')\n",
    "    ax.annotate('沈雁冰', male,fontproperties=myfont)\n",
    "\n",
    "    f = ax.scatter(female[0], female[1], s=60, c='red', marker='o')\n",
    "    ax.annotate('冰心', female,fontproperties=myfont)\n",
    "\n",
    "    # The legend labels are Chinese as well, so they need the same font as every other\n",
    "    # label of this figure; without it they cannot render on machines lacking a CJK font.\n",
    "    ax.legend((m, f), (' 男', ' 女'), prop=myfont)\n",
    "\n",
    "    plt.xlim(-0.1, 1.5)\n",
    "    plt.ylim(-0.1, 1.5)\n",
    "    plt.xticks([0, 1])\n",
    "    plt.yticks([0, 1])\n",
    "    plt.title('性别分类问题',fontproperties=myfont)\n",
    "    # Decision boundary through (0.5, 0) and (1, 1.5), labelled with its equation.\n",
    "    newline([0.5, 0], [1, 1.5])\n",
    "    ax.annotate('3x-y-1.5=0', [0.75, 0.6])\n",
    "\n",
    "    plt.xlabel('特征1：是否含“雁”',fontproperties=myfont)\n",
    "    plt.ylabel('特征2：是否含“冰”',fontproperties=myfont)\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/eval_perceptron_cws.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-06-01 11:07\n",
    "# 《自然语言处理入门》5.6 基于结构化感知机的中文分词\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.msr import msr_train, msr_model, msr_test, msr_gold, msr_output, msr_dict\n",
    "\n",
    "CWSTrainer = JClass('com.hankcs.hanlp.model.perceptron.CWSTrainer')\n",
    "PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')\n",
    "LinearModel = JClass('com.hankcs.hanlp.model.perceptron.model.LinearModel')\n",
    "Segment = JClass('com.hankcs.hanlp.seg.Segment')\n",
    "CWSEvaluator = JClass('com.hankcs.hanlp.seg.common.CWSEvaluator')\n",
    "\n",
    "\n",
    "def trainStructuredPerceptron():\n",
    "    \"\"\"Train on ``msr_train`` and return an analyzer with the custom dictionary disabled.\n",
    "\n",
    "    Differs from ``trainAveragedPerceptron`` only in the last ``train`` argument (8 here).\n",
    "    NOTE(review): that argument is presumably the thread count — confirm against CWSTrainer.\n",
    "    \"\"\"\n",
    "    model = CWSTrainer().train(msr_train, msr_train, msr_model, 0., 10, 8).getModel()\n",
    "    return PerceptronLexicalAnalyzer(model).enableCustomDictionary(False)\n",
    "\n",
    "\n",
    "def trainAveragedPerceptron():\n",
    "    \"\"\"Train on ``msr_train`` and return an analyzer with the custom dictionary disabled.\n",
    "\n",
    "    Differs from ``trainStructuredPerceptron`` only in the last ``train`` argument (1 here).\n",
    "    NOTE(review): that argument is presumably the thread count — confirm against CWSTrainer.\n",
    "    \"\"\"\n",
    "    model = CWSTrainer().train(msr_train, msr_train, msr_model, 0., 10, 1).getModel()\n",
    "    return PerceptronLexicalAnalyzer(model).enableCustomDictionary(False)\n",
    "\n",
    "\n",
    "# Train each variant in turn and print its evaluation on the MSR test files.\n",
    "print(\"结构化感知机\")\n",
    "print(CWSEvaluator.evaluate(trainStructuredPerceptron(), msr_test, msr_output, msr_gold, msr_dict))\n",
    "print(\"平均感知机\")\n",
    "print(CWSEvaluator.evaluate(trainAveragedPerceptron(), msr_test, msr_output, msr_gold, msr_dict))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/online_learning.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-25 13:35\n",
    "# 《自然语言处理入门》5.6.6 模型调整与在线学习\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import PerceptronLexicalAnalyzer, HanLP, CustomDictionary\n",
    "from tests.book.ch03.msr import msr_model\n",
    "\n",
    "HanLP.Config.ShowTermNature = False\n",
    "# Baseline: analyzer loaded from the MSR model with the custom dictionary disabled.\n",
    "segment = PerceptronLexicalAnalyzer(msr_model).enableCustomDictionary(False)\n",
    "text = \"与川普通电话\"\n",
    "print(segment.seg(text))\n",
    "\n",
    "# Attempt 1: insert the word into the custom dictionary and enable forcing mode.\n",
    "CustomDictionary.insert(\"川普\", \"nrf 1\")\n",
    "segment.enableCustomDictionaryForcing(True)\n",
    "print(segment.seg(text))\n",
    "\n",
    "print(segment.seg(\"银川普通人与川普通电话讲四川普通话\"))\n",
    "\n",
    "# Attempt 2: switch the dictionary off again and teach the model through online learning.\n",
    "segment.enableCustomDictionary(False)\n",
    "for i in range(3):                                  # learn the sample three times\n",
    "    segment.learn(\"人 与 川普 通电话\")                # the online-learning input must be an annotated sample\n",
    "print(segment.seg(\"银川普通人与川普通电话讲四川普通话\"))\n",
    "print(segment.seg(\"首相与川普通话讨论四川普通高考\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/plot_compressed_f1.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-22 17:53\n",
    "# 《自然语言处理入门》5.6.3 特征裁剪与模型压缩\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "from jpype import JClass\n",
    "\n",
    "from tests.book.ch03.eval_bigram_cws import CWSEvaluator\n",
    "from tests.book.ch03.msr import msr_train, msr_model, msr_gold, msr_dict, msr_output, msr_test\n",
    "from tests.book.ch05.perceptron_cws import CWSTrainer, PerceptronLexicalAnalyzer\n",
    "\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly\n",
    "plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly\n",
    "\n",
    "\n",
    "def train_evaluate(ratios):\n",
    "    \"\"\"Score the segmenter after each of several successive compressions of one model.\n",
    "\n",
    "    The model is loaded from ``msr_model`` when that file exists and trained otherwise.\n",
    "    The first ratio is evaluated without calling ``compress``; every later ratio triggers\n",
    "    one ``compress`` call on the same model object before the evaluation.\n",
    "\n",
    "    :param ratios: overall compression ratios, in the order they are applied\n",
    "    :return: list with one parsed score per ratio\n",
    "    \"\"\"\n",
    "    if not os.path.isfile(msr_model):\n",
    "        model = CWSTrainer().train(msr_train, msr_train, msr_model, 0, 10, 8).getModel()  # train the model\n",
    "    else:\n",
    "        model = JClass('com.hankcs.hanlp.model.perceptron.model.LinearModel')(msr_model)\n",
    "    pre = None  # 1 - previous ratio, i.e. the share left after the previous step (None at first)\n",
    "    scores = []\n",
    "    for c in ratios:\n",
    "        if pre:\n",
    "            print('以压缩比{}压缩模型中...'.format(c))\n",
    "            # The same model object is reused, so the overall target c is converted into a\n",
    "            # ratio relative to what the previous step left: keep (1 - c) / pre of it.\n",
    "            model.compress(1 - (1 - c) / pre, 0)\n",
    "        pre = 1 - c\n",
    "        result = CWSEvaluator.evaluate(PerceptronLexicalAnalyzer(model).enableCustomDictionary(False),\n",
    "                                       msr_test, msr_output, msr_gold, msr_dict)\n",
    "        # scores.append(result.F1)\n",
    "        # Parse the score from the printed result: third whitespace-separated token,\n",
    "        # minus its three-character prefix.\n",
    "        scores.append(float(str(result).split()[2][3:]))\n",
    "    return scores\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Compression ratios 0.0, 0.1, ..., 0.9 and the score measured at each of them.\n",
    "    x = [c / 10 for c in range(0, 10)]\n",
    "    y = train_evaluate(x)\n",
    "    # NOTE(review): myfont is not defined in this cell; presumably it comes from the\n",
    "    # setup cell at the top of the notebook — confirm.\n",
    "    plt.title(\"压缩率对准确率的影响\",fontproperties=myfont)\n",
    "    plt.xlabel(\"压缩率\",fontproperties=myfont)\n",
    "    plt.ylabel(\"准确率\",fontproperties=myfont)\n",
    "    plt.xticks([c / 10 for c in range(0, 11)])\n",
    "    # plt.ylim(min(y), max(y))\n",
    "    plt.plot(x, y, color='b')\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/perceptron_cws.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-22 15:18\n",
    "# 《自然语言处理入门》5.6 基于结构化感知机的中文分词\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.eval_bigram_cws import CWSEvaluator\n",
    "from tests.book.ch03.msr import msr_train, msr_model, msr_dict, msr_gold, msr_output, msr_test\n",
    "\n",
    "CWSTrainer = JClass('com.hankcs.hanlp.model.perceptron.CWSTrainer')\n",
    "\n",
    "\n",
    "def train_uncompressed_model():\n",
    "    \"\"\"Train a model on ``msr_train`` and save it to ``msr_model`` with text export enabled.\"\"\"\n",
    "    model = CWSTrainer().train(msr_train, msr_train, msr_model, 0., 10, 8).getModel()  # train the model\n",
    "    model.save(msr_model, model.featureMap.entrySet(), 0, True)  # the last argument requests a txt export\n",
    "\n",
    "\n",
    "def train():\n",
    "    \"\"\"Train on ``msr_train`` and return an analyzer with the custom dictionary disabled.\"\"\"\n",
    "    model = CWSTrainer().train(msr_train, msr_model).getModel()  # train the model\n",
    "    segment = PerceptronLexicalAnalyzer(model).enableCustomDictionary(False)  # create the segmenter\n",
    "    return segment\n",
    "    # print(CWSEvaluator.evaluate(segment, msr_test, msr_output, msr_gold, msr_dict))  # standard evaluation\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train the segmenter, then print its output for a handful of sample sentences.\n",
    "    segment = train()\n",
    "    sents = [\n",
    "        \"王思斌，男，１９４９年１０月生。\",\n",
    "        \"山东桓台县起凤镇穆寨村妇女穆玲英\",\n",
    "        \"现为中国艺术研究院中国文化研究所研究员。\",\n",
    "        \"我们的父母重男轻女\",\n",
    "        \"北京输气管道工程\",\n",
    "    ]\n",
    "    for sent in sents:\n",
    "        print(segment.seg(sent))\n",
    "    # train_uncompressed_model()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch05/plot_corpus_ratio_f1.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-06-22 17:53\n",
    "# 《自然语言处理入门》5.6.3 特征裁剪与模型压缩\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tempfile import NamedTemporaryFile\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "__doc__ = '试验语料库规模对准确率的影响'\n",
    "\n",
    "from tests.book.ch03.eval_bigram_cws import CWSEvaluator\n",
    "from tests.book.ch03.msr import msr_train, msr_model, msr_gold, msr_dict, msr_output, msr_test\n",
    "from tests.book.ch05.perceptron_cws import CWSTrainer, PerceptronLexicalAnalyzer\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly\n",
    "plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly\n",
    "\n",
    "\n",
    "def train_evaluate(ratio):\n",
    "    \"\"\"Train on the leading ``ratio`` share of the MSR training lines and return the score.\n",
    "\n",
    "    :param ratio: share of the training lines to keep, counted from the start of the file\n",
    "    :return: score parsed from the printed evaluation result\n",
    "    \"\"\"\n",
    "    import os  # local import: this cell does not import os itself\n",
    "\n",
    "    # Reserve a temp file name and close the handle right away; the file is rewritten below.\n",
    "    tmp = NamedTemporaryFile(delete=False)\n",
    "    tmp.close()\n",
    "    partial_corpus = tmp.name\n",
    "    try:\n",
    "        with open(msr_train, encoding='utf-8') as src, open(partial_corpus, 'w', encoding='utf-8') as dst:\n",
    "            all_lines = src.readlines()\n",
    "            dst.writelines(all_lines[:int(ratio * len(all_lines))])\n",
    "\n",
    "        model = CWSTrainer().train(partial_corpus, partial_corpus, msr_model, 0, 50, 8).getModel()  # train the model\n",
    "        result = CWSEvaluator.evaluate(PerceptronLexicalAnalyzer(model).enableCustomDictionary(False),\n",
    "                                       msr_test, msr_output, msr_gold, msr_dict)\n",
    "    finally:\n",
    "        # delete=False means nothing removes the file for us; without this every call\n",
    "        # would leave one partial copy of the corpus in the temp directory.\n",
    "        os.remove(partial_corpus)\n",
    "    # return result.F1\n",
    "    # Parse the score from the printed result: third whitespace-separated token,\n",
    "    # minus its three-character prefix.\n",
    "    return float(str(result).split()[2][3:])\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    x = [r / 10 for r in range(1, 11)]\n",
    "    y = [train_evaluate(r) for r in x]\n",
    "    plt.title(\"语料库规模对准确率的影响\",fontproperties=myfont)\n",
    "    plt.xlabel(\"语料库规模（万字符）\",fontproperties=myfont)\n",
    "    plt.ylabel(\"准确率\",fontproperties=myfont)\n",
    "    plt.xticks([int(r / 10 * 405) for r in range(1, 11)])\n",
    "    plt.yticks(np.arange(91, 97.5, 0.5))\n",
    "    plt.plot([int(r / 10 * 405) for r in range(1, 11)], y, color='b')\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第06章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch06/evaluate_crf_cws.py\n",
    "CRF训练很慢，请耐心等待"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-02 14:43\n",
    "# 《自然语言处理入门》6.4 HanLP 中的 CRF++ API\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03 import msr\n",
    "from tests.book.ch03.eval_bigram_cws import CWSEvaluator\n",
    "from tests.book.ch06.crfpp_train_hanlp_load import CRF_MODEL_PATH, CRF_MODEL_TXT_PATH\n",
    "\n",
    "CRFSegmenter = JClass('com.hankcs.hanlp.model.crf.CRFSegmenter')\n",
    "CRFLexicalAnalyzer = JClass('com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer')\n",
    "\n",
    "\n",
    "def train(corpus):\n",
    "    \"\"\"Train a blank CRFSegmenter on `corpus` and return it wrapped in a CRFLexicalAnalyzer.\n",
    "\n",
    "    CRF_MODEL_PATH is handed to the trainer as its second argument (presumably the model output path).\n",
    "    \"\"\"\n",
    "    crf_segmenter = CRFSegmenter(None)\n",
    "    crf_segmenter.train(corpus, CRF_MODEL_PATH)\n",
    "    analyzer = CRFLexicalAnalyzer(crf_segmenter)\n",
    "    return analyzer\n",
    "    # Once training has finished, a txt-format model can be passed in instead\n",
    "    # (a CRF++ binary model cannot be passed in: it is incompatible!)\n",
    "    # return CRFLexicalAnalyzer(CRF_MODEL_TXT_PATH).enableCustomDictionary(False)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    segment = train(msr.msr_train)\n",
    "    print(CWSEvaluator.evaluate(segment, msr.msr_test, msr.msr_output, msr.msr_gold, msr.msr_dict))  # 标准化评测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch06/crfpp_train_hanlp_load.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-01 19:15\n",
    "# 《自然语言处理入门》6.4 HanLP 中的 CRF++ API\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from pyhanlp.static import HANLP_JAR_PATH\n",
    "from tests.book.ch03.demo_corpus_loader import my_cws_corpus\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "CRFSegmenter = JClass('com.hankcs.hanlp.model.crf.CRFSegmenter')\n",
    "\n",
    "TXT_CORPUS_PATH = my_cws_corpus()\n",
    "TSV_CORPUS_PATH = TXT_CORPUS_PATH + \".tsv\"\n",
    "TEMPLATE_PATH = test_data_path() + \"/cws-template.txt\"\n",
    "CRF_MODEL_PATH = test_data_path() + \"/crf-cws-model\"\n",
    "CRF_MODEL_TXT_PATH = test_data_path() + \"/crf-cws-model.txt\"\n",
    "\n",
    "\n",
    "def train_or_load(corpus_txt_path=TXT_CORPUS_PATH, model_txt_path=CRF_MODEL_TXT_PATH):\n",
    "    \"\"\"Load the CRF segmenter from `model_txt_path` if that file exists; otherwise prepare CRF++ inputs.\n",
    "\n",
    "    When the model file is missing, the corpus is converted to TSV_CORPUS_PATH, the feature\n",
    "    template is dumped to TEMPLATE_PATH and the commands to run are printed.\n",
    "\n",
    "    :return: a CRFSegmenter when the model file exists, otherwise None\n",
    "    \"\"\"\n",
    "    if not os.path.isfile(model_txt_path):\n",
    "        blank_segmenter = CRFSegmenter()  # create a blank segmenter\n",
    "        blank_segmenter.convertCorpus(corpus_txt_path, TSV_CORPUS_PATH)  # run the conversion\n",
    "        blank_segmenter.dumpTemplate(TEMPLATE_PATH)  # export the feature template\n",
    "        # Training itself is handed over to CRF++\n",
    "        print(\"语料已转换为 %s ，特征模板已导出为 %s\" % (TSV_CORPUS_PATH, TEMPLATE_PATH))\n",
    "        print(\"请安装CRF++后执行 crf_learn -f 3 -c 4.0 %s %s %s -t\" % (TEMPLATE_PATH, TSV_CORPUS_PATH, CRF_MODEL_PATH))\n",
    "        java_args = (HANLP_JAR_PATH, TEMPLATE_PATH, TSV_CORPUS_PATH, CRF_MODEL_PATH)\n",
    "        print(\"或者执行移植版 java -cp %s com.hankcs.hanlp.model.crf.crfpp.crf_learn -f 3 -c 4.0 %s %s %s -t\" % java_args)\n",
    "        return None\n",
    "    # Already trained: load directly\n",
    "    return CRFSegmenter(model_txt_path)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    segment = train_or_load()\n",
    "    if segment:\n",
    "        print(segment.segment(\"商品和服务\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch06/plot_3d_sgd_newton.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：https://am207.github.io/2017/wiki/gradientdescent.html\n",
    "# Date: 2018-06-26 17:15\n",
    "# 《自然语言处理入门》6.2.2 条件随机场的训练\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from sklearn.datasets import make_regression\n",
    "\n",
    "\n",
    "def gradient_descent(x, y, theta_init, step=0.001, maxsteps=0, precision=0.001):\n",
    "    \"\"\"Run batch gradient descent on the squared-error cost of the linear model `np.dot(x, theta)`.\n",
    "\n",
    "    Iteration stops once the cost changes by no more than `precision` between two\n",
    "    consecutive steps, or when the step counter reaches `maxsteps` (0 means no cap).\n",
    "\n",
    "    :param x: design matrix; `np.dot(x, theta)` must yield one prediction per entry of `y`\n",
    "    :param y: target values\n",
    "    :param theta_init: starting parameter vector\n",
    "    :param step: learning rate\n",
    "    :param maxsteps: cap on the step counter, which starts at 1; 0 disables the cap\n",
    "    :param precision: convergence threshold on the absolute cost change\n",
    "    :return: (thetas visited, cost of each theta, sampled predictions, final counter)\n",
    "    \"\"\"\n",
    "    n_points = y.size  # number of data points\n",
    "\n",
    "    def evaluate(weights):\n",
    "        # Prediction, residual and halved mean squared error for one parameter vector.\n",
    "        prediction = np.dot(x, weights)\n",
    "        residual = prediction - y\n",
    "        return prediction, residual, np.sum(residual ** 2) / (2 * n_points)\n",
    "\n",
    "    theta = theta_init\n",
    "    pred, residual, currentcost = evaluate(theta)\n",
    "    history = [theta]  # every theta visited, starting point included\n",
    "    costs = [currentcost]\n",
    "    preds = [pred]\n",
    "    counter = 1\n",
    "    oldcost = 0\n",
    "    while abs(currentcost - oldcost) > precision:\n",
    "        oldcost = currentcost\n",
    "        theta = theta - step * (x.T.dot(residual) / n_points)  # gradient step\n",
    "        history.append(theta)\n",
    "\n",
    "        pred, residual, currentcost = evaluate(theta)\n",
    "        costs.append(currentcost)\n",
    "\n",
    "        # Predictions are only sampled when the counter is a multiple of 25.\n",
    "        if counter % 25 == 0:\n",
    "            preds.append(pred)\n",
    "        counter += 1\n",
    "        if maxsteps and counter == maxsteps:\n",
    "            break\n",
    "\n",
    "    return history, costs, preds, counter\n",
    "\n",
    "\n",
    "x, y = make_regression(n_samples=100,\n",
    "                       n_features=1,\n",
    "                       n_informative=1,\n",
    "                       noise=20,\n",
    "                       random_state=66)\n",
    "x = x.flatten()\n",
    "\n",
    "xaug = np.c_[np.ones(x.shape[0]), x]\n",
    "theta_i = [-15, 40] + np.random.rand(2)\n",
    "history, cost, preds, iters = gradient_descent(xaug, y, theta_i, step=0.1)\n",
    "theta = history[-1]\n",
    "\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "\n",
    "\n",
    "def error(X, Y, THETA):\n",
    "    \"\"\"Return the halved mean squared residual of the linear model `X.dot(THETA)` against `Y`.\"\"\"\n",
    "    residual = X.dot(THETA) - Y\n",
    "    return np.sum(residual ** 2) / (2 * Y.size)\n",
    "\n",
    "\n",
    "ms = np.linspace(theta[0] - 20, theta[0] + 20, 20)\n",
    "bs = np.linspace(theta[1] - 40, theta[1] + 40, 40)\n",
    "\n",
    "M, B = np.meshgrid(ms, bs)\n",
    "\n",
    "zs = np.array([error(xaug, y, theta)\n",
    "               for theta in zip(np.ravel(M), np.ravel(B))])\n",
    "Z = zs.reshape(M.shape)\n",
    "\n",
    "fig = plt.figure(figsize=(10, 6))\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "\n",
    "ax.plot_surface(M, B, Z, rstride=1, cstride=1, color='b', alpha=0.2)\n",
    "ax.contour(M, B, Z, 20, color='b', alpha=0.5, offset=0, stride=30)\n",
    "\n",
    "ax.set_xlabel('$w_1$')\n",
    "ax.set_ylabel('$w_2$')\n",
    "ax.set_zlabel('$J(w_1,w_2)$')\n",
    "ax.view_init(elev=30., azim=30)\n",
    "ax.plot([theta[0]], [theta[1]], [cost[-1]], markerfacecolor='r', markeredgecolor='r', marker='<', markersize=7)\n",
    "ax.plot([history[0][0]], [history[0][1]], [cost[0]], markerfacecolor='r', markeredgecolor='r', marker='8', markersize=7)\n",
    "\n",
    "ax.plot([history[0][0]], [history[0][1]], 0, markerfacecolor='r', markeredgecolor='r', marker='8',\n",
    "        markersize=7)\n",
    "ax.plot([t[0] for t in history], [t[1] for t in history], cost, markerfacecolor='r', markeredgecolor='r', marker='.',\n",
    "        markersize=1)\n",
    "ax.plot([t[0] for t in history], [t[1] for t in history], 0, markerfacecolor='r', markeredgecolor='r', marker='.',\n",
    "        markersize=1)\n",
    "ax.plot([history[-1][0]], [history[-1][1]], 0, markerfacecolor='r', markeredgecolor='r', marker='<',\n",
    "        markersize=7)\n",
    "\n",
    "k = (history[0][1] - history[-1][1]) / (history[0][0] - history[-1][0])\n",
    "b = history[0][1] - k * history[0][0]\n",
    "ax.plot([t[0] for t in history], [t[0] * k + b for t in history], 0, markerfacecolor='b', markeredgecolor='b',\n",
    "        marker=',',\n",
    "        markersize=1)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch06/plot_2d_newton.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 《自然语言处理入门》6.2.2 条件随机场的训练\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签\n",
    "plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号\n",
    "leafNode = dict(boxstyle=\"round4\", fc=\"0.8\")\n",
    "arrow_args = dict(arrowstyle=\"<-\")\n",
    "\n",
    "matplotlib.rcParams['xtick.direction'] = 'out'\n",
    "matplotlib.rcParams['ytick.direction'] = 'out'\n",
    "\n",
    "delta = 0.025\n",
    "x = np.arange(-2.0, 2.0, delta)\n",
    "y = np.arange(-2.0, 2.0, delta)\n",
    "X, Y = np.meshgrid(x, y)\n",
    "Z1 = -((X - 1) ** 2)\n",
    "Z2 = -(Y ** 2)\n",
    "Z = -1.0 * (Z2 + Z1) + 1\n",
    "\n",
    "plt.figure()\n",
    "CS = plt.contour(X, Y, Z)\n",
    "plt.annotate('', xy=(0.05, 0.05), xycoords='axes fraction',\n",
    "             xytext=(0.2, 0.2), textcoords='axes fraction',\n",
    "             va=\"center\", ha=\"center\", bbox=leafNode, arrowprops=arrow_args)\n",
    "plt.text(-1.85, -1.67, '$P_0$')\n",
    "\n",
    "plt.annotate('', xy=(0.2, 0.2), xycoords='axes fraction',\n",
    "             xytext=(0.35, 0.33), textcoords='axes fraction',\n",
    "             va=\"center\", ha=\"center\", bbox=leafNode, arrowprops=arrow_args)\n",
    "plt.text(-1.32, -1.23, '$P_1$')\n",
    "\n",
    "plt.annotate('', xy=(0.35, 0.33), xycoords='axes fraction',\n",
    "             xytext=(0.52, 0.43), textcoords='axes fraction',\n",
    "             va=\"center\", ha=\"center\", bbox=leafNode, arrowprops=arrow_args)\n",
    "plt.text(-0.7, -0.65, '$P_2$')\n",
    "\n",
    "plt.annotate('', xy=(0.52, 0.43), xycoords='axes fraction',\n",
    "             xytext=(0.75, 0.5), textcoords='axes fraction',\n",
    "             va=\"center\", ha=\"center\", bbox=leafNode, arrowprops=arrow_args)\n",
    "plt.text(0., -0.24, '$P_3$')\n",
    "plt.text(0.95, -0.1, '$P_4$')\n",
    "\n",
    "plt.annotate('', xy=(0.05, 0.05), xycoords='axes fraction',\n",
    "             xytext=(0.75, 0.5), textcoords='axes fraction',\n",
    "             va=\"center\", ha=\"center\", bbox=leafNode, arrowprops={\"arrowstyle\": \"<-\", 'ls': 'dashed'})\n",
    "\n",
    "plt.xticks([])\n",
    "plt.yticks([])\n",
    "plt.clabel(CS, inline=1, fontsize=10)\n",
    "plt.title('梯度下降',fontproperties=myfont)\n",
    "plt.xlabel('$w_1$')\n",
    "plt.ylabel('$w_2$')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第07章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/demo_perceptron_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-05 10:19\n",
    "# 《自然语言处理入门》7.3.2 基于感知机的词性标注\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter\n",
    "from tests.book.ch07.pku import PKU199801_TRAIN, POS_MODEL\n",
    "\n",
    "POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')\n",
    "PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')\n",
    "\n",
    "\n",
    "def train_perceptron_pos(corpus):\n",
    "    \"\"\"Train a perceptron POS tagger on `corpus`, print two sample outputs and return the tagger.\"\"\"\n",
    "    POSTrainer().train(corpus, POS_MODEL)  # train\n",
    "    pos_tagger = PerceptronPOSTagger(POS_MODEL)  # load\n",
    "    predicted_tags = pos_tagger.tag(\"他\", \"的\", \"希望\", \"是\", \"希望\", \"上学\")  # predict\n",
    "    print(', '.join(predicted_tags))\n",
    "    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), pos_tagger)  # build the lexical analyzer\n",
    "    print(lexical_analyzer.analyze(\"李狗蛋的希望是希望上学\"))  # segmentation + POS tagging\n",
    "    return pos_tagger\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    train_perceptron_pos(PKU199801_TRAIN)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/custom_corpus_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-06 13:54\n",
    "# 《自然语言处理入门》7.4.2 标注语料\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter\n",
    "from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "ZHUXIAN = ensure_data(\"zhuxian\", \"http://file.hankcs.com/corpus/zhuxian.zip\") + \"/train.txt\"\n",
    "posTagger = train_perceptron_pos(ZHUXIAN)  # 训练\n",
    "analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), posTagger)  # 包装\n",
    "print(analyzer.analyze(\"陆雪琪的天琊神剑不做丝毫退避，直冲而上，瞬间，这两道奇光异宝撞到了一起。\"))  # 分词+标注"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/demo_hmm_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-04 17:34\n",
    "# 《自然语言处理入门》7.3.1 基于隐马尔可夫模型的词性标注\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from  pyhanlp import *\n",
    "from tests.book.ch07.pku import PKU199801_TRAIN\n",
    "\n",
    "HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')\n",
    "AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')\n",
    "PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')\n",
    "FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')\n",
    "SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')\n",
    "\n",
    "def train_hmm_pos(corpus, model):\n",
    "    \"\"\"Train an HMMPOSTagger built on `model` with `corpus`, print two sample outputs and return it.\"\"\"\n",
    "    hmm_tagger = HMMPOSTagger(model)  # create the POS tagger\n",
    "    hmm_tagger.train(corpus)  # train\n",
    "    predicted_tags = hmm_tagger.tag(\"他\", \"的\", \"希望\", \"是\", \"希望\", \"上学\")  # predict\n",
    "    print(', '.join(predicted_tags))\n",
    "    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), hmm_tagger)  # build the lexical analyzer\n",
    "    print(lexical_analyzer.analyze(\"他的希望是希望上学\"))  # segmentation + POS tagging\n",
    "    return hmm_tagger\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    tagger = train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel())\n",
    "    tagger = train_hmm_pos(PKU199801_TRAIN, SecondOrderHiddenMarkovModel())  # 或二阶隐马"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/evaluate_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-05 15:12\n",
    "# 《自然语言处理入门》7.3.1 基于隐马尔可夫模型的词性标注\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07.demo_crf_pos import train_crf_pos\n",
    "from tests.book.ch07.demo_hmm_pos import train_hmm_pos, FirstOrderHiddenMarkovModel, SecondOrderHiddenMarkovModel\n",
    "from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos\n",
    "from tests.book.ch07.pku import PKU199801_TRAIN, PKU199801_TEST\n",
    "\n",
    "PosTagUtil = JClass('com.hankcs.hanlp.dependency.nnparser.util.PosTagUtil')\n",
    "\n",
    "print(\"一阶HMM\\t%.2f%%\" % (\n",
    "    PosTagUtil.evaluate(train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel()), PKU199801_TEST)))\n",
    "print(\"二阶HMM\\t%.2f%%\" % (\n",
    "    PosTagUtil.evaluate(train_hmm_pos(PKU199801_TRAIN, SecondOrderHiddenMarkovModel()), PKU199801_TEST)))\n",
    "print(\"感知机\\t%.2f%%\" % (PosTagUtil.evaluate(train_perceptron_pos(PKU199801_TRAIN), PKU199801_TEST)))\n",
    "print(\"CRF\\t%.2f%%\" % (PosTagUtil.evaluate(train_crf_pos(PKU199801_TRAIN), PKU199801_TEST)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/custom_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-05 16:14\n",
    "# 《自然语言处理入门》7.4 自定义词性\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "CustomDictionary.insert(\"苹果\", \"手机品牌 1\")\n",
    "CustomDictionary.insert(\"iPhone X\", \"手机型号 1\")\n",
    "analyzer = PerceptronLexicalAnalyzer()\n",
    "analyzer.enableCustomDictionaryForcing(True)\n",
    "print(analyzer.analyze(\"你们苹果iPhone X保修吗？\"))\n",
    "print(analyzer.analyze(\"多吃苹果有益健康\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/demo_crf_pos.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-05 10:56\n",
    "# 《自然语言处理入门》7.3.3 基于条件随机场的词性标注\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter\n",
    "from tests.book.ch07.pku import POS_MODEL, PKU199801_TRAIN\n",
    "\n",
    "CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')\n",
    "\n",
    "\n",
    "def train_crf_pos(corpus):\n",
    "    \"\"\"Train a CRF POS tagger on `corpus`, reload it from POS_MODEL, print two sample outputs and return it.\"\"\"\n",
    "    # Option 1: train through HanLP's Java API (slow)\n",
    "    CRFPOSTagger(None).train(corpus, POS_MODEL)  # blank tagger, used for training\n",
    "    crf_tagger = CRFPOSTagger(POS_MODEL)  # load\n",
    "    # Option 2: train with CRF++ and load with HanLP (option 1 gives the training command)\n",
    "    # crf_tagger = CRFPOSTagger(POS_MODEL + \".txt\")\n",
    "    predicted_tags = crf_tagger.tag(\"他\", \"的\", \"希望\", \"是\", \"希望\", \"上学\")  # predict\n",
    "    print(', '.join(predicted_tags))\n",
    "    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), crf_tagger)  # build the lexical analyzer\n",
    "    print(lexical_analyzer.analyze(\"李狗蛋的希望是希望上学\"))  # segmentation + POS tagging\n",
    "    return crf_tagger\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    tagger = train_crf_pos(PKU199801_TRAIN)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch07/pku.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-04 17:41\n",
    "# 《自然语言处理入门》7.2.1 《人民日报》语料库与 PKU 标注集\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import os\n",
    "\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "PKU98 = ensure_data(\"pku98\", \"http://file.hankcs.com/corpus/pku98.zip\")\n",
    "PKU199801 = os.path.join(PKU98, '199801.txt')\n",
    "PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')\n",
    "PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')\n",
    "POS_MODEL = os.path.join(PKU98, 'pos.bin')\n",
    "NER_MODEL = os.path.join(PKU98, 'ner.bin')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第08章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_role_tag_nt.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-27 15:35\n",
    "# 《自然语言处理入门》8.4.3 基于角色标注的机构名识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.ngram_segment import DijkstraSegment\n",
    "from tests.book.ch07 import pku\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "EasyDictionary = JClass('com.hankcs.hanlp.corpus.dictionary.EasyDictionary')\n",
    "NTDictionaryMaker = JClass('com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker')\n",
    "Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')\n",
    "MODEL = test_data_path() + \"/ns\"\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train an NTDictionaryMaker on `corpus` and save the result in txt form under `model`.\"\"\"\n",
    "    core_dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath)  # core dictionary\n",
    "    nt_maker = NTDictionaryMaker(core_dictionary)  # training module\n",
    "    nt_maker.train(corpus)  # train on the corpus\n",
    "    nt_maker.saveTxtTo(model)  # write the HMM out as txt\n",
    "\n",
    "\n",
    "def load(model):\n",
    "    \"\"\"Point HanLP's organization-name (nt) dictionaries at `model` and build a segmenter.\n",
    "\n",
    "    :param model: path prefix of the trained model; `.txt` and `.tr.txt` are appended to it\n",
    "    :return: a DijkstraSegment with organization recognition on and the custom dictionary off\n",
    "    \"\"\"\n",
    "    # train() builds the model with NTDictionaryMaker, so it has to replace the organization\n",
    "    # dictionaries; writing it to the place (ns) paths would leave organization recognition\n",
    "    # on its stock model and clobber the place configuration instead.\n",
    "    HanLP.Config.OrganizationDictionaryPath = model + \".txt\"\n",
    "    HanLP.Config.OrganizationDictionaryTrPath = model + \".tr.txt\"\n",
    "    segment = DijkstraSegment().enableOrganizationRecognize(True).enableCustomDictionary(False)  # this segmenter is convenient for debugging\n",
    "    return segment\n",
    "\n",
    "\n",
    "def test(model=MODEL):\n",
    "    \"\"\"Load the segmenter for `model`, switch on HanLP debug output and print one segmented sample.\"\"\"\n",
    "    debug_segment = load(model)\n",
    "    HanLP.Config.enableDebug()\n",
    "    sample_text = \"温州黄鹤皮革制造有限公司是由黄先生创办的企业\"\n",
    "    print(debug_segment.seg(sample_text))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    train(pku.PKU199801, MODEL)\n",
    "    test(MODEL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_num_eng.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-24 16:55\n",
    "# 《自然语言处理入门》8.2.3 基于规则的数词英文识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "from tests.book.ch03.ngram_segment import ViterbiSegment\n",
    "\n",
    "CharType = JClass('com.hankcs.hanlp.dictionary.other.CharType')\n",
    "\n",
    "segment = ViterbiSegment()\n",
    "print(segment.seg(\"牛奶三〇〇克壹佰块\"))\n",
    "print(segment.seg(\"牛奶300克100块\"))\n",
    "print(segment.seg(\"牛奶300g100rmb\"))\n",
    "# 演示自定义字符类型\n",
    "text = \"牛奶300~400g100rmb\"\n",
    "print(segment.seg(text))\n",
    "CharType.set('~', CharType.CT_NUM)\n",
    "print(segment.seg(text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_crf_ner.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-29 18:07\n",
    "# 《自然语言处理入门》8.5.4 基于条件随机场序列标注的命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07 import pku\n",
    "from tests.book.ch08.demo_hmm_ner import test\n",
    "\n",
    "NERTrainer = JClass('com.hankcs.hanlp.model.perceptron.NERTrainer')\n",
    "CRFNERecognizer = JClass('com.hankcs.hanlp.model.crf.CRFNERecognizer')\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train a blank CRFNERecognizer on `corpus` with `model` as its second argument, then reload from `model`.\"\"\"\n",
    "    CRFNERecognizer(None).train(corpus, model)  # blank recognizer, used only for training\n",
    "    trained_recognizer = CRFNERecognizer(model)\n",
    "    return trained_recognizer\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    recognizer = train(pku.PKU199801_TRAIN, pku.NER_MODEL)\n",
    "    test(recognizer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/msra_ner.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-04 17:41\n",
    "# 《自然语言处理入门》8.5.3 基于感知机序列标注的命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "import os\n",
    "\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "MSRA_NER = ensure_data(\"msra-ne\", \"http://file.hankcs.com/corpus/msra-ne.zip\")\n",
    "MSRA_NER_TRAIN = os.path.join(MSRA_NER, 'train.txt')\n",
    "MSRA_NER_TEST = os.path.join(MSRA_NER, 'test.txt')\n",
    "MSRA_NER_MODEL = os.path.join(MSRA_NER, 'model.bin')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_role_tag_nr.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-24 22:04\n",
    "# 《自然语言处理入门》8.4.1 基于角色标注的中国人名识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.ngram_segment import DijkstraSegment\n",
    "from tests.book.ch07 import pku\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "EasyDictionary = JClass('com.hankcs.hanlp.corpus.dictionary.EasyDictionary')\n",
    "NRDictionaryMaker = JClass('com.hankcs.hanlp.corpus.dictionary.NRDictionaryMaker')\n",
    "Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')\n",
    "MODEL = test_data_path() + \"/nr\"\n",
    "\n",
    "\n",
    "def demoNR():\n",
    "    \"\"\"Switch on HanLP debug output and print DijkstraSegment's result for one sample sentence.\"\"\"\n",
    "    HanLP.Config.enableDebug()\n",
    "    print(DijkstraSegment().seg(\"王国维和服务员\"))\n",
    "\n",
    "\n",
    "def train_one_sent():\n",
    "    \"\"\"Learn from one annotated sentence with verbose output and save the result in txt form under MODEL.\"\"\"\n",
    "    core_dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath)  # core dictionary\n",
    "    nr_maker = NRDictionaryMaker(core_dictionary)  # training module\n",
    "    nr_maker.verbose = True  # debug output\n",
    "    annotated = Sentence.create(\"这里/r 有/v 关天培/nr 的/u 有关/vn 事迹/n 。/w\")\n",
    "    nr_maker.learn([annotated])  # learn a single sentence\n",
    "    nr_maker.saveTxtTo(MODEL)  # write the HMM out as txt\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train an NRDictionaryMaker on `corpus` and save the result in txt form under `model`.\"\"\"\n",
    "    core_dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath)  # core dictionary\n",
    "    nr_maker = NRDictionaryMaker(core_dictionary)  # training module\n",
    "    nr_maker.train(corpus)  # train on the corpus\n",
    "    nr_maker.saveTxtTo(model)  # write the HMM out as txt\n",
    "\n",
    "\n",
    "def load(model):\n",
    "    \"\"\"Point HanLP's person-name dictionary paths at `model` and return a fresh DijkstraSegment.\n",
    "\n",
    "    :param model: path prefix; `.txt` and `.tr.txt` are appended to it\n",
    "    \"\"\"\n",
    "    person_dict_path, person_tr_path = model + \".txt\", model + \".tr.txt\"\n",
    "    HanLP.Config.PersonDictionaryPath = person_dict_path\n",
    "    HanLP.Config.PersonDictionaryTrPath = person_tr_path\n",
    "    return DijkstraSegment()  # this segmenter is convenient for debugging\n",
    "\n",
    "\n",
    "def test():\n",
    "    \"\"\"Load the segmenter for MODEL, switch on HanLP debug output and print one segmented sample.\"\"\"\n",
    "    debug_segment = load(MODEL)\n",
    "    HanLP.Config.enableDebug()\n",
    "    sample_text = \"龚学平等领导\"\n",
    "    print(debug_segment.seg(sample_text))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    demoNR()\n",
    "    train_one_sent()\n",
    "    train(pku.PKU199801, MODEL)\n",
    "    test()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_sp_ner.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-29 15:06\n",
    "# 《自然语言处理入门》8.5.3 基于感知机序列标注的命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07 import pku\n",
    "from tests.book.ch08.demo_hmm_ner import test, PerceptronSegmenter, PerceptronPOSTagger\n",
    "from tests.book.ch08.demo_role_tag_nr import Sentence\n",
    "\n",
    "NERTrainer = JClass('com.hankcs.hanlp.model.perceptron.NERTrainer')\n",
    "PerceptronNERecognizer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronNERecognizer')\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train with NERTrainer on `corpus` (with `model` as the second argument) and wrap the result in a PerceptronNERecognizer.\"\"\"\n",
    "    ner_model = NERTrainer().train(corpus, model).getModel()\n",
    "    return PerceptronNERecognizer(ner_model)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    recognizer = train(pku.PKU199801_TRAIN, pku.NER_MODEL)\n",
    "    test(recognizer)\n",
    "    analyzer = PerceptronLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)  # ①\n",
    "    analyzer.enableCustomDictionary(False)\n",
    "    sentence = Sentence.create(\"与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt\")  # ②\n",
    "    while not analyzer.analyze(sentence.text()).equals(sentence):  # ③\n",
    "        analyzer.learn(sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_msra_ner.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-29 15:06\n",
    "# 《自然语言处理入门》8.5.3 基于感知机序列标注的命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07.demo_perceptron_pos import POSTrainer\n",
    "from tests.book.ch08.demo_hmm_ner import PerceptronSegmenter, PerceptronPOSTagger\n",
    "from tests.book.ch08.demo_sp_ner import PerceptronNERecognizer, NERTrainer\n",
    "from tests.book.ch08.msra_ner import MSRA_NER_TRAIN\n",
    "\n",
    "\n",
    "def train_ner(corpus):\n",
    "    \"\"\"Return a PerceptronNERecognizer for `corpus`, loading `ner.bin` next to it when that file exists.\n",
    "\n",
    "    Otherwise a NERTrainer is given the label list below in place of its current labels and\n",
    "    trained on `corpus`, with `ner.bin` passed as the model path.\n",
    "    \"\"\"\n",
    "    model = os.path.join(os.path.dirname(corpus), 'ner.bin')\n",
    "    if os.path.isfile(model):\n",
    "        return PerceptronNERecognizer(model)\n",
    "    # Label strings are passed through verbatim, spelling included (\"CAPACTITY\") --\n",
    "    # presumably they mirror the corpus annotation; confirm before changing any of them.\n",
    "    msra_labels = [\n",
    "        \"AGE\", \"ANGLE\", \"AREA\", \"CAPACTITY\", \"DATE\", \"DECIMAL\", \"DURATION\", \"FRACTION\", \"FREQUENCY\", \"INTEGER\",\n",
    "        \"LENGTH\", \"LOCATION\", \"MEASURE\", \"MONEY\", \"ORDINAL\", \"ORGANIZATION\", \"PERCENT\", \"PERSON\", \"PHONE\",\n",
    "        \"POSTALCODE\", \"RATE\", \"SPEED\", \"TEMPERATURE\", \"TIME\", \"WEIGHT\", \"WWW\"]\n",
    "    trainer = NERTrainer()\n",
    "    ner_labels = trainer.tagSet.nerLabels\n",
    "    ner_labels.clear()  # do not recognise nr, ns, nt\n",
    "    ner_labels.addAll(msra_labels)\n",
    "    return PerceptronNERecognizer(trainer.train(corpus, model).getModel())\n",
    "\n",
    "\n",
    "def train_pos(corpus):\n",
    "    \"\"\"Return a POS tagger, loading pos.bin from the corpus folder when it already exists.\n",
    "\n",
    "    Otherwise a new model is trained on corpus with POSTrainer and wrapped in a tagger.\n",
    "    \"\"\"\n",
    "    model_path = os.path.join(os.path.dirname(corpus), 'pos.bin')\n",
    "    if not os.path.isfile(model_path):\n",
    "        training_result = POSTrainer().train(corpus, model_path)\n",
    "        return PerceptronPOSTagger(training_result.getModel())\n",
    "    return PerceptronPOSTagger(model_path)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Both the tagger and the recognizer are trained on (or loaded next to) MSRA_NER_TRAIN.\n",
    "    tagger = train_pos(MSRA_NER_TRAIN)\n",
    "    recognizer = train_ner(MSRA_NER_TRAIN)\n",
    "    analyzer = PerceptronLexicalAnalyzer(PerceptronSegmenter(), tagger, recognizer)\n",
    "    analyzer.enableCustomDictionary(False)\n",
    "    print(analyzer.analyze('2008年5月20日山东大连气温30多摄氏度，王莲香首场赢下李钊颖，中国女队有机会赢下韩国队'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_hmm_ner.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-28 20:59\n",
    "# 《自然语言处理入门》8.5.2 基于隐马尔可夫模型序列标注的 命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch07 import pku\n",
    "\n",
    "HMMNERecognizer = JClass('com.hankcs.hanlp.model.hmm.HMMNERecognizer')\n",
    "AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')\n",
    "PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')\n",
    "PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')\n",
    "Utility = JClass('com.hankcs.hanlp.model.perceptron.utility.Utility')\n",
    "\n",
    "\n",
    "def train(corpus):\n",
    "    \"\"\"Train a new HMMNERecognizer on corpus and return it.\"\"\"\n",
    "    hmm_recognizer = HMMNERecognizer()\n",
    "    hmm_recognizer.train(corpus)  # e.g. data/test/pku98/199801-train.txt\n",
    "    return hmm_recognizer\n",
    "\n",
    "\n",
    "def test(recognizer):\n",
    "    \"\"\"Demonstrate a recognizer on fixed examples, then evaluate it on the PKU test set.\n",
    "\n",
    "    Prints one word/tag/label row per word of a three-word example, the analysis of a full\n",
    "    sentence through a segmenter + tagger + recognizer pipeline, and the NER scores that\n",
    "    Utility.evaluateNER computes on pku.PKU199801_TEST.\n",
    "    \"\"\"\n",
    "    words = [\"华北\", \"电力\", \"公司\"]  # word sequence\n",
    "    tags = [\"ns\", \"n\", \"n\"]  # part-of-speech sequence\n",
    "    labels = recognizer.recognize(words, tags)  # sequence labeling\n",
    "    for row in zip(words, tags, labels):\n",
    "        print(\"%s\\t%s\\t%s\\t\" % row)\n",
    "    pipeline = AbstractLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)\n",
    "    print(pipeline.analyze(\"华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观\"))\n",
    "    Utility.printNERScore(Utility.evaluateNER(recognizer, pku.PKU199801_TEST))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train the HMM recognizer on the PKU training split, then run the demo and evaluation in test().\n",
    "    recognizer = train(pku.PKU199801_TRAIN)\n",
    "    test(recognizer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_plane.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-29 23:24\n",
    "# 《自然语言处理入门》8.6 自定义领域命名实体识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from tests.book.ch05.perceptron_cws import CWSTrainer\n",
    "from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter\n",
    "from tests.book.ch07.demo_perceptron_pos import PerceptronPOSTagger\n",
    "from tests.book.ch08.demo_sp_ner import NERTrainer, os, PerceptronNERecognizer\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "# Corpus folder, training file and NER model path for the aircraft-name demo.\n",
    "PLANE_ROOT = ensure_data(\"plane-re\", \"http://file.hankcs.com/corpus/plane-re.zip\")\n",
    "PLANE_CORPUS = os.path.join(PLANE_ROOT, 'train.txt')\n",
    "PLANE_MODEL = os.path.join(PLANE_ROOT, 'model.bin')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    trainer = NERTrainer()\n",
    "    trainer.tagSet.nerLabels.clear()  # do not recognize nr, ns, nt\n",
    "    trainer.tagSet.nerLabels.add(\"np\")  # the goal is to recognize np\n",
    "    recognizer = PerceptronNERecognizer(trainer.train(PLANE_CORPUS, PLANE_MODEL).getModel())\n",
    "    # NER prediction needs a segmenter first, ideally one trained on the same corpus.\n",
    "    # The segmenter model goes next to the NER model; the path is built from the folder of\n",
    "    # PLANE_MODEL rather than with str.replace, which would rewrite every 'model.bin' in the path.\n",
    "    CWS_MODEL = CWSTrainer().train(PLANE_CORPUS, os.path.join(os.path.dirname(PLANE_MODEL), 'cws.bin')).getModel()\n",
    "    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(CWS_MODEL), PerceptronPOSTagger(), recognizer)\n",
    "    print(analyzer.analyze(\"米高扬设计米格-17PF：米格-17PF型战斗机比米格-17P性能更好。\"))\n",
    "    print(analyzer.analyze(\"米格-阿帕奇-666S横空出世。\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch08/demo_role_tag_ns.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-27 11:17\n",
    "# 《自然语言处理入门》8.4.2 基于角色标注的地名识别\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.ngram_segment import DijkstraSegment\n",
    "from tests.book.ch07 import pku\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "EasyDictionary = JClass('com.hankcs.hanlp.corpus.dictionary.EasyDictionary')\n",
    "NSDictionaryMaker = JClass('com.hankcs.hanlp.corpus.dictionary.NSDictionaryMaker')\n",
    "Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')\n",
    "MODEL = test_data_path() + \"/ns\"\n",
    "\n",
    "\n",
    "def train(corpus, model):\n",
    "    \"\"\"Train the place-name dictionary maker on corpus and save its output as text to model.\"\"\"\n",
    "    core_dictionary = EasyDictionary.create(HanLP.Config.CoreDictionaryPath)  # core dictionary\n",
    "    dictionary_maker = NSDictionaryMaker(core_dictionary)  # training module\n",
    "    dictionary_maker.train(corpus)  # train on the corpus\n",
    "    dictionary_maker.saveTxtTo(model)  # write the HMM out as txt\n",
    "\n",
    "\n",
    "def load(model):\n",
    "    \"\"\"Point HanLP's place dictionary settings at the files under model and build a segmenter.\n",
    "\n",
    "    Sets HanLP.Config.PlaceDictionaryPath and PlaceDictionaryTrPath (a global side effect) and\n",
    "    returns a DijkstraSegment with place recognition on and the custom dictionary off.\n",
    "    \"\"\"\n",
    "    dictionary_path = model + \".txt\"  # e.g. data/test/ns.txt\n",
    "    transition_path = model + \".tr.txt\"  # e.g. data/test/ns.tr.txt\n",
    "    HanLP.Config.PlaceDictionaryPath = dictionary_path\n",
    "    HanLP.Config.PlaceDictionaryTrPath = transition_path\n",
    "    # this segmenter is convenient for debugging\n",
    "    return DijkstraSegment().enablePlaceRecognize(True).enableCustomDictionary(False)\n",
    "\n",
    "\n",
    "def test(model=MODEL):\n",
    "    \"\"\"Load the segmenter for model, switch on HanLP debug output and segment a sample phrase.\"\"\"\n",
    "    place_segment = load(model)\n",
    "    HanLP.Config.enableDebug()\n",
    "    segmented = place_segment.seg(\"生于黑牛沟村\")\n",
    "    print(segmented)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # train() saves its output under the MODEL prefix; test() then points HanLP at MODEL + '.txt' / '.tr.txt'.\n",
    "    train(pku.PKU199801, MODEL)\n",
    "    test(MODEL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第09章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch09/demo_extract_word.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-07-30 21:03\n",
    "# 《自然语言处理入门》9.1 新词提取\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "# Corpora used by this demo: four text files and the Weibo classification folder.\n",
    "# NOTE(review): ensure_data presumably fetches each archive on first use and returns the local\n",
    "# path of the named file or folder - confirm in tests.test_utility.\n",
    "HLM_PATH = ensure_data(\"红楼梦.txt\", \"http://file.hankcs.com/corpus/红楼梦.zip\")\n",
    "XYJ_PATH = ensure_data(\"西游记.txt\", \"http://file.hankcs.com/corpus/西游记.zip\")\n",
    "SHZ_PATH = ensure_data(\"水浒传.txt\", \"http://file.hankcs.com/corpus/水浒传.zip\")\n",
    "SAN_PATH = ensure_data(\"三国演义.txt\", \"http://file.hankcs.com/corpus/三国演义.zip\")\n",
    "WEIBO_PATH = ensure_data(\"weibo-classification\", \"http://file.hankcs.com/corpus/weibo-classification.zip\")\n",
    "\n",
    "\n",
    "def test_weibo():\n",
    "    \"\"\"Print the words HanLP extracts from each category folder of the Weibo corpus.\n",
    "\n",
    "    Every sub-folder of WEIBO_PATH is treated as one category: its files are read as UTF-8,\n",
    "    concatenated, and passed to HanLP.extractWords with a size argument of 100.\n",
    "    Entries of WEIBO_PATH that are not folders, and folder entries that are not regular\n",
    "    files, are skipped instead of raising.\n",
    "    \"\"\"\n",
    "    for folder in os.listdir(WEIBO_PATH):\n",
    "        folder_path = os.path.join(WEIBO_PATH, folder)\n",
    "        if not os.path.isdir(folder_path):\n",
    "            continue  # a stray file here would make the inner os.listdir raise\n",
    "        print(folder)\n",
    "        texts = []\n",
    "        for file in os.listdir(folder_path):\n",
    "            file_path = os.path.join(folder_path, file)\n",
    "            if not os.path.isfile(file_path):\n",
    "                continue\n",
    "            with open(file_path, encoding='utf-8') as src:\n",
    "                texts.append(src.read())\n",
    "        # join once instead of growing a string with += for every file\n",
    "        word_info_list = HanLP.extractWords(\"\".join(texts), 100)\n",
    "        print(word_info_list)\n",
    "\n",
    "\n",
    "def extract(corpus):\n",
    "    \"\"\"Print a heading and the words HanLP.extractWords returns for the text file at corpus.\n",
    "\n",
    "    The file is opened through IOUtil.newBufferedReader and 100 is passed as the size argument.\n",
    "    \"\"\"\n",
    "    heading = \"%s 热词\" % corpus\n",
    "    print(heading)\n",
    "    reader = IOUtil.newBufferedReader(corpus)\n",
    "    print(HanLP.extractWords(reader, 100))\n",
    "    # Disabled variant, kept for reference (presumably lists new words only - confirm):\n",
    "    # print(\"%s 新词\" % corpus)\n",
    "    # word_info_list = HanLP.extractWords(IOUtil.newBufferedReader(corpus), 100, True)\n",
    "    # print(word_info_list)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Print the extracted words for each of the four text corpora, then for every Weibo category.\n",
    "    extract(HLM_PATH)\n",
    "    extract(XYJ_PATH)\n",
    "    extract(SHZ_PATH)\n",
    "    extract(SAN_PATH)\n",
    "    test_weibo()\n",
    "\n",
    "    # More parameters\n",
    "    # NOTE(review): the meaning of the extra arguments (True, 4, 0.0, .5, 100) is not visible here;\n",
    "    # check the HanLP.extractWords overloads before changing them.\n",
    "    word_info_list = HanLP.extractWords(IOUtil.newBufferedReader(HLM_PATH), 100, True, 4, 0.0, .5, 100)\n",
    "    print(word_info_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch09/demo_term_freq.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-08-01 16:15\n",
    "# 《自然语言处理入门》9.2 关键词提取\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "TermFrequency = JClass('com.hankcs.hanlp.corpus.occurrence.TermFrequency')\n",
    "TermFrequencyCounter = JClass('com.hankcs.hanlp.mining.word.TermFrequencyCounter')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    counter = TermFrequencyCounter()\n",
    "    counter.add(\"加油加油中国队！\")  # first document\n",
    "    counter.add(\"中国观众高呼加油中国\")  # second document\n",
    "    for termFrequency in counter:  # iterate over every term and its frequency\n",
    "        print(\"%s=%d\" % (termFrequency.getTerm(), termFrequency.getFrequency()))\n",
    "    print(counter.top(2))  # take the top N\n",
    "\n",
    "    # Extract keywords by term frequency\n",
    "    print(TermFrequencyCounter.getKeywordList(\"女排夺冠，观众欢呼女排女排女排！\", 3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch09/demo_tfidf.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-08-01 18:25\n",
    "# 《自然语言处理入门》9.2 关键词提取\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "TfIdfCounter = JClass('com.hankcs.hanlp.mining.word.TfIdfCounter')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    counter = TfIdfCounter()\n",
    "    counter.add(\"《女排夺冠》\", \"女排北京奥运会夺冠\")  # feed in several documents\n",
    "    counter.add(\"《羽毛球男单》\", \"北京奥运会的羽毛球男单决赛\")\n",
    "    counter.add(\"《女排》\", \"中国队女排夺北京奥运会金牌重返巅峰，观众欢呼女排女排女排！\")\n",
    "    counter.compute()  # all documents have been added\n",
    "    # The loop variable is doc_id, not id: a top-level name id would shadow the builtin id()\n",
    "    # for every cell run afterwards in the same kernel.\n",
    "    for doc_id in counter.documents():\n",
    "        print(doc_id + \" : \" + counter.getKeywordsOf(doc_id, 3).toString())  # keywords of each document by its TF-IDF\n",
    "    # Use the IDF statistics of the corpus to extract keywords from a new document outside the corpus\n",
    "    print(counter.getKeywords(\"奥运会反兴奋剂\", 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第10章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch10/demo_text_clustering.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-08-18 23:26\n",
    "# 《自然语言处理入门》第 10 章 文本聚类\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "ClusterAnalyzer = JClass('com.hankcs.hanlp.mining.cluster.ClusterAnalyzer')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    analyzer = ClusterAnalyzer()\n",
    "    # Each document is an id plus a comma-separated list of repeated genre words.\n",
    "    analyzer.addDocument(\"赵一\", \"流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 流行, 蓝调, 蓝调, 蓝调, 蓝调, 蓝调, 蓝调, 摇滚, 摇滚, 摇滚, 摇滚\")\n",
    "    analyzer.addDocument(\"钱二\", \"爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲\")\n",
    "    analyzer.addDocument(\"张三\", \"古典, 古典, 古典, 古典, 民谣, 民谣, 民谣, 民谣\")\n",
    "    analyzer.addDocument(\"李四\", \"爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 爵士, 金属, 金属, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲, 舞曲\")\n",
    "    analyzer.addDocument(\"王五\", \"流行, 流行, 流行, 流行, 摇滚, 摇滚, 摇滚, 嘻哈, 嘻哈, 嘻哈\")\n",
    "    analyzer.addDocument(\"马六\", \"古典, 古典, 古典, 古典, 古典, 古典, 古典, 古典, 摇滚\")\n",
    "    # Cluster the same documents three ways; the int vs. float argument selects the repeatedBisection overload.\n",
    "    print(analyzer.kmeans(3))\n",
    "    print(analyzer.repeatedBisection(3))\n",
    "    print(analyzer.repeatedBisection(1.0))  # determine the number of clusters k automatically"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch10/demo_clustering_f.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author：hankcs\n",
    "# Date: 2018-08-19 20:01\n",
    "# 《自然语言处理入门》10.5 标准化评测\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.demos.demo_text_classification import sogou_corpus_path\n",
    "\n",
    "ClusterAnalyzer = JClass('com.hankcs.hanlp.mining.cluster.ClusterAnalyzer')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Evaluate both algorithm names on the Sogou corpus; the returned score is multiplied by 100 for display.\n",
    "    for algorithm in \"kmeans\", \"repeated bisection\":\n",
    "        print(\"%s F1=%.2f\\n\" % (algorithm, ClusterAnalyzer.evaluate(sogou_corpus_path, algorithm) * 100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch10/demo_get_bow_vec.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2020-07-31 20:55\n",
    "# 《自然语言处理入门》第 10 章 文本聚类 （这段代码来自书籍之外的附赠答疑）\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "import os\n",
    "\n",
    "from pyhanlp.static import STATIC_ROOT, HANLP_JAR_PATH\n",
    "\n",
    "import subprocess\n",
    "\n",
    "java_code_path = os.path.join(STATIC_ROOT, 'MyClusterAnalyzer.java')\n",
    "# Write a small ClusterAnalyzer subclass whose public toVector(String) wraps toVector(preprocess(...)).\n",
    "with open(java_code_path, 'w') as out:\n",
    "    java_code = \"\"\"\n",
    "import com.hankcs.hanlp.mining.cluster.ClusterAnalyzer;\n",
    "import com.hankcs.hanlp.mining.cluster.SparseVector;\n",
    "\n",
    "public class MyClusterAnalyzer<K> extends ClusterAnalyzer<K>\n",
    "{\n",
    "    public SparseVector toVector(String document)\n",
    "    {\n",
    "        return toVector(preprocess(document));\n",
    "    }\n",
    "}\n",
    "\"\"\"\n",
    "    out.write(java_code)\n",
    "# Compile into STATIC_ROOT. An argument list (instead of a formatted shell string) keeps paths\n",
    "# with spaces intact, and check_call raises CalledProcessError when javac fails, so a broken\n",
    "# build is reported here rather than as a missing class later on.\n",
    "subprocess.check_call(['javac', '-cp', HANLP_JAR_PATH, java_code_path, '-d', STATIC_ROOT])\n",
    "# 编译结束才可以启动hanlp\n",
    "from pyhanlp import *\n",
    "\n",
    "ClusterAnalyzer = JClass('MyClusterAnalyzer')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    analyzer = ClusterAnalyzer()\n",
    "    # ClusterAnalyzer is bound to MyClusterAnalyzer in this cell, so toVector(str) is available.\n",
    "    vec = analyzer.toVector(\"古典, 古典, 古典, 古典, 古典, 古典, 古典, 古典, 摇滚\")\n",
    "    print(vec)\n",
    "    # print(analyzer.kmeans(3))\n",
    "    # print(analyzer.repeatedBisection(3))\n",
    "    # print(analyzer.repeatedBisection(1.0))  # determine the number of clusters k automatically"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第11章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch11/demo_load_text_classification_corpus.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-01-03 19:36\n",
    "# 《自然语言处理入门》11.2 文本分类语料库\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.demos.demo_text_classification import sogou_corpus_path\n",
    "\n",
    "AbstractDataSet = JClass('com.hankcs.hanlp.classification.corpus.AbstractDataSet')\n",
    "Document = JClass('com.hankcs.hanlp.classification.corpus.Document')\n",
    "FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')\n",
    "MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')\n",
    "\n",
    "# Demonstrates loading a text classification corpus\n",
    "if __name__ == '__main__':\n",
    "    dataSet = MemoryDataSet()  # ① load the data set into memory\n",
    "    dataSet.load(sogou_corpus_path)  # ② load the mini Sogou text classification corpus under data/test\n",
    "    dataSet.add(\"自然语言处理\", \"自然语言处理很有趣\")  # ③ add a new sample\n",
    "    allClasses = dataSet.getCatalog().getCategories()  # ④ get the label set\n",
    "    print(\"标注集：%s\" % (allClasses))\n",
    "    for document in dataSet.iterator():\n",
    "        # Only the first document is shown; document.category is used as an index into allClasses.\n",
    "        print(\"第一篇文档的类别：\" + allClasses.get(document.category))\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch11/demo_text_classification.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-01-04 20:28\n",
    "# 《自然语言处理入门》11.4.2 朴素贝叶斯文本分类器实现\n",
    "# 请参考tests/demos/demo_text_classification.py\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch11/demo_svm_text_classification.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-01-06 14:09\n",
    "# 《自然语言处理入门》11.5.2 线性支持向量机文本分类器实现\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp.static import STATIC_ROOT, download\n",
    "import os\n",
    "\n",
    "from tests.demos.demo_text_classification import sogou_corpus_path\n",
    "\n",
    "\n",
    "def install_jar(name, url):\n",
    "    \"\"\"Make sure the jar called name exists under STATIC_ROOT, downloading it from url if missing.\n",
    "\n",
    "    :param name: file name of the jar inside STATIC_ROOT\n",
    "    :param url: address passed to download when the file is not there yet\n",
    "    :return: the path of the jar inside STATIC_ROOT\n",
    "    \"\"\"\n",
    "    jar_path = os.path.join(STATIC_ROOT, name)\n",
    "    if not os.path.isfile(jar_path):\n",
    "        download(url, jar_path)\n",
    "    return jar_path\n",
    "\n",
    "\n",
    "# Fetch the SVM classifier jars into STATIC_ROOT before the pyhanlp import and class lookups below.\n",
    "# NOTE(review): presumably the jars must be on disk before the JVM starts - confirm for notebook\n",
    "# runs, where pyhanlp may already have been imported by an earlier cell.\n",
    "install_jar('text-classification-svm-1.0.2.jar', 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')\n",
    "install_jar('liblinear-1.95.jar', 'http://file.hankcs.com/bin/liblinear-1.95.jar')\n",
    "from pyhanlp import *\n",
    "\n",
    "LinearSVMClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')\n",
    "IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')\n",
    "\n",
    "\n",
    "def train_or_load_classifier():\n",
    "    \"\"\"Return a LinearSVMClassifier, reusing the serialized model next to the corpus if present.\n",
    "\n",
    "    The model file is sogou_corpus_path + '.svm.ser'. When it does not exist, a classifier is\n",
    "    trained on sogou_corpus_path and its model is saved to that file first.\n",
    "    \"\"\"\n",
    "    model_path = sogou_corpus_path + '.svm.ser'\n",
    "    if not os.path.isfile(model_path):\n",
    "        svm = LinearSVMClassifier()\n",
    "        svm.train(sogou_corpus_path)\n",
    "        trained_model = svm.getModel()\n",
    "        IOUtil.saveObjectTo(trained_model, model_path)\n",
    "    else:\n",
    "        trained_model = IOUtil.readObjectFrom(model_path)\n",
    "    return LinearSVMClassifier(trained_model)\n",
    "\n",
    "\n",
    "def predict(classifier, text):\n",
    "    print(\"《%16s》\\t属于分类\\t【%s】\" % (text, classifier.classify(text)))\n",
    "    # 如需获取离散型随机变量的分布，请使用predict接口\n",
    "    # print(\"《%16s》\\t属于分类\\t【%s】\" % (text, classifier.predict(text)))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train (or load) the SVM classifier once, then classify five sample sentences.\n",
    "    classifier = train_or_load_classifier()\n",
    "    predict(classifier, \"C罗获2018环球足球奖最佳球员 德尚荣膺最佳教练\")\n",
    "    predict(classifier, \"潜艇具有很强的战略威慑能力与实战能力\")\n",
    "    predict(classifier, \"研究生考录模式亟待进一步专业化\")\n",
    "    predict(classifier, \"如果真想用食物解压,建议可以食用燕麦\")\n",
    "    predict(classifier, \"通用及其部分竞争对手目前正在考虑解决库存问题\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch11/demo_text_classification_evaluation.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-01-04 20:28\n",
    "# 《自然语言处理入门》11.6 标准化评测\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "from tests.demos.demo_text_classification import sogou_corpus_path\n",
    "\n",
    "IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')\n",
    "NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')\n",
    "LinearSVMClassifier = JClass('com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')\n",
    "FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')\n",
    "IDataSet = JClass('com.hankcs.hanlp.classification.corpus.IDataSet')\n",
    "MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')\n",
    "Evaluator = JClass('com.hankcs.hanlp.classification.statistics.evaluations.Evaluator')\n",
    "FMeasure = JClass('com.hankcs.hanlp.classification.statistics.evaluations.FMeasure')\n",
    "BigramTokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')\n",
    "HanLPTokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')\n",
    "ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')\n",
    "\n",
    "\n",
    "def evaluate(classifier, tokenizer):\n",
    "    \"\"\"Train classifier on the Sogou corpus with tokenizer, evaluate it and print the result.\n",
    "\n",
    "    The training set is loaded with a ratio argument of 0.9 and the test set with -0.1\n",
    "    (presumably the first 90% and the last 10% of each category - confirm against the\n",
    "    data set load documentation). Prints the classifier and tokenizer class names, then the\n",
    "    evaluation result.\n",
    "    \"\"\"\n",
    "    train_set = FileDataSet().setTokenizer(tokenizer).load(sogou_corpus_path, \"UTF-8\", 0.9)\n",
    "    classifier.train(train_set)\n",
    "    test_set = MemoryDataSet(classifier.getModel()).load(sogou_corpus_path, \"UTF-8\", -0.1)\n",
    "    report = Evaluator.evaluate(classifier, test_set)\n",
    "    title = classifier.getClass().getSimpleName() + \"+\" + tokenizer.getClass().getSimpleName()\n",
    "    print(title)\n",
    "    print(report)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Evaluate every classifier/tokenizer combination; each run gets freshly constructed instances.\n",
    "    evaluate(NaiveBayesClassifier(), HanLPTokenizer())\n",
    "    evaluate(NaiveBayesClassifier(), BigramTokenizer())\n",
    "    evaluate(LinearSVMClassifier(), HanLPTokenizer())\n",
    "    evaluate(LinearSVMClassifier(), BigramTokenizer())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第12章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch12/demo_train_parser.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-02-11 23:18\n",
    "# 《自然语言处理入门》12.5.1 训练模型\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.test_utility import ensure_data\n",
    "\n",
    "KBeamArcEagerDependencyParser = JClass('com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser')\n",
    "# Corpus splits, model path and Brown cluster file for the dependency parser.\n",
    "# NOTE(review): ensure_data presumably fetches each archive on first use and returns the local path - confirm.\n",
    "CTB_ROOT = ensure_data(\"ctb8.0-dep\", \"http://file.hankcs.com/corpus/ctb8.0-dep.zip\")\n",
    "CTB_TRAIN = CTB_ROOT + \"/train.conll\"\n",
    "CTB_DEV = CTB_ROOT + \"/dev.conll\"\n",
    "CTB_TEST = CTB_ROOT + \"/test.conll\"\n",
    "CTB_MODEL = CTB_ROOT + \"/ctb.bin\"\n",
    "BROWN_CLUSTER = ensure_data(\"wiki-cn-cluster.txt\", \"http://file.hankcs.com/corpus/wiki-cn-cluster.zip\")\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Train from the train/dev splits and the Brown cluster file; CTB_MODEL is passed last\n",
    "    # (presumably the path the model is written to).\n",
    "    parser = KBeamArcEagerDependencyParser.train(CTB_TRAIN, CTB_DEV, BROWN_CLUSTER, CTB_MODEL)\n",
    "    print(parser.parse(\"人吃鱼\"))\n",
    "    score = parser.evaluate(CTB_TEST)\n",
    "    # score[0] and score[1] are printed as UAS and LAS respectively\n",
    "    print(\"UAS=%.1f LAS=%.1f\\n\" % (score[0], score[1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch12/opinion_mining.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-06-02 18:03\n",
    "# 《自然语言处理入门》12.6 案例:基于依存句法树的意见抽取\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "\n",
    "CoNLLSentence = JClass('com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence')\n",
    "CoNLLWord = JClass('com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord')\n",
    "IDependencyParser = JClass('com.hankcs.hanlp.dependency.IDependencyParser')\n",
    "KBeamArcEagerDependencyParser = JClass('com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser')\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Parse a sample review sentence and run the three opinion extractors on its tree.\"\"\"\n",
    "    dependency_parser = KBeamArcEagerDependencyParser()\n",
    "    tree = dependency_parser.parse(\"电池非常棒，机身不长，长的是待机，但是屏幕分辨率不高。\")\n",
    "    print(tree)\n",
    "    versions = (\n",
    "        (\"第一版\", extactOpinion1),\n",
    "        (\"第二版\", extactOpinion2),\n",
    "        (\"第三版\", extactOpinion3),\n",
    "    )\n",
    "    for title, extractor in versions:\n",
    "        print(title)\n",
    "        extractor(tree)\n",
    "\n",
    "\n",
    "def extactOpinion1(tree):\n",
    "    for word in tree.iterator():\n",
    "        if word.POSTAG == \"NN\" and word.DEPREL == \"nsubj\":\n",
    "            print(\"%s = %s\" % (word.LEMMA, word.HEAD.LEMMA))\n",
    "\n",
    "\n",
    "def extactOpinion2(tree):\n",
    "    for word in tree.iterator():\n",
    "        if word.POSTAG == \"NN\" and word.DEPREL == \"nsubj\":\n",
    "            if tree.findChildren(word.HEAD, \"neg\").isEmpty():\n",
    "                print(\"%s = %s\" % (word.LEMMA, word.HEAD.LEMMA))\n",
    "            else:\n",
    "                print(\"%s = 不%s\" % (word.LEMMA, word.HEAD.LEMMA))\n",
    "\n",
    "\n",
    "def extactOpinion3(tree):\n",
    "    for word in tree.iterator():\n",
    "        if word.POSTAG == \"NN\":\n",
    "            if word.DEPREL == \"nsubj\":  # ①属性\n",
    "\n",
    "                if tree.findChildren(word.HEAD, \"neg\").isEmpty():\n",
    "                    print(\"%s = %s\" % (word.LEMMA, word.HEAD.LEMMA))\n",
    "                else:\n",
    "                    print(\"%s = 不%s\" % (word.LEMMA, word.HEAD.LEMMA))\n",
    "            elif word.DEPREL == \"attr\":\n",
    "                top = tree.findChildren(word.HEAD, \"top\")  # ②主题\n",
    "\n",
    "                if not top.isEmpty():\n",
    "                    print(\"%s = %s\" % (word.LEMMA, top.get(0).LEMMA))\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Run the three-version opinion extraction demo.\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第13章"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch13/demo_word2vec.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-02-26 19:59\n",
    "# 《自然语言处理入门》13.3 word2vec\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from pyhanlp import *\n",
    "from tests.book.ch03.msr import msr_train\n",
    "from tests.test_utility import test_data_path\n",
    "\n",
    "IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')\n",
    "DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')\n",
    "Word2VecTrainer = JClass('com.hankcs.hanlp.mining.word2vec.Word2VecTrainer')\n",
    "WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')\n",
    "\n",
    "# Demonstrates training and applying word vectors\n",
    "TRAIN_FILE_NAME = msr_train\n",
    "MODEL_FILE_NAME = os.path.join(test_data_path(), \"word2vec.txt\")\n",
    "\n",
    "\n",
    "def print_nearest(word, model):\n",
    "    print(\n",
    "        \"\\n                                                Word     \"\n",
    "        \"Cosine\\n------------------------------------------------------------------------\")\n",
    "    for entry in model.nearest(word):\n",
    "        print(\"%50s\\t\\t%f\" % (entry.getKey(), entry.getValue()))\n",
    "\n",
    "\n",
    "def print_nearest_document(document, documents, model):\n",
    "    \"\"\"Print the header for document, then one row per entry of model.nearest(document).\n",
    "\n",
    "    Each entry's key is used as an index into documents to show the matching text.\n",
    "    \"\"\"\n",
    "    print_header(document)\n",
    "    for hit in model.nearest(document):\n",
    "        matched_text = documents[hit.getKey()]\n",
    "        print(\"%50s\\t\\t%f\" % (matched_text, hit.getValue()))\n",
    "\n",
    "\n",
    "def print_header(query):\n",
    "    print(\n",
    "        \"\\n%50s          Cosine\\n------------------------------------------------------------------------\" % (query))\n",
    "\n",
    "\n",
    "def train_or_load_model():\n",
    "    \"\"\"Return the word vector model, training it first when MODEL_FILE_NAME does not exist.\n",
    "\n",
    "    :raises RuntimeError: when neither the model file nor the training corpus exists\n",
    "    \"\"\"\n",
    "    if IOUtil.isFileExisted(MODEL_FILE_NAME):\n",
    "        return load_model()\n",
    "    if not IOUtil.isFileExisted(TRAIN_FILE_NAME):\n",
    "        raise RuntimeError(\"语料不存在，请阅读文档了解语料获取与格式：https://github.com/hankcs/HanLP/wiki/word2vec\")\n",
    "    word2vec_trainer = Word2VecTrainer()\n",
    "    return word2vec_trainer.train(TRAIN_FILE_NAME, MODEL_FILE_NAME)\n",
    "\n",
    "\n",
    "def load_model():\n",
    "    \"\"\"Build a WordVectorModel from the file at MODEL_FILE_NAME.\"\"\"\n",
    "    model_path = MODEL_FILE_NAME\n",
    "    return WordVectorModel(model_path)\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Word vectors: nearest neighbours, pairwise similarity and analogy.\n",
    "    wordVectorModel = train_or_load_model()\n",
    "    print_nearest(\"上海\", wordVectorModel)\n",
    "    print_nearest(\"美丽\", wordVectorModel)\n",
    "    print_nearest(\"购买\", wordVectorModel)\n",
    "    print(wordVectorModel.similarity(\"上海\", \"广州\"))\n",
    "    print(wordVectorModel.analogy(\"日本\", \"自民党\", \"共和党\"))\n",
    "    # Document vectors\n",
    "    docVectorModel = DocVectorModel(wordVectorModel)\n",
    "    documents = [\"山东苹果丰收\",\n",
    "                 \"农民在江苏种水稻\",\n",
    "                 \"奥运会女排夺冠\",\n",
    "                 \"世界锦标赛胜出\",\n",
    "                 \"中国足球失败\", ]\n",
    "    print(docVectorModel.similarity(\"山东苹果丰收\", \"农民在江苏种水稻\"))\n",
    "    print(docVectorModel.similarity(\"山东苹果丰收\", \"世界锦标赛胜出\"))\n",
    "    print(docVectorModel.similarity(documents[0], documents[1]))\n",
    "    print(docVectorModel.similarity(documents[0], documents[4]))\n",
    "    # Register every document under its list index; print_nearest_document maps the ids back to text.\n",
    "    for i, d in enumerate(documents):\n",
    "        docVectorModel.addDocument(i, d)\n",
    "    print_nearest_document(\"体育\", documents, docVectorModel)\n",
    "    print_nearest_document(\"农业\", documents, docVectorModel)\n",
    "    print_nearest_document(\"我要看比赛\", documents, docVectorModel)\n",
    "    print_nearest_document(\"要不做饭吧\", documents, docVectorModel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch13/demo_neual_parser.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-02-26 22:58\n",
    "# 《自然语言处理入门》13.4 基于神经网络的高性能依存句法分析器\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "from pyhanlp import *\n",
    "\n",
    "# Java classes used by this demo. JClass is presumably supplied by the\n",
    "# pyhanlp star-import above -- confirm if the import is ever narrowed.\n",
    "CoNLLSentence = JClass('com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence')\n",
    "CoNLLWord = JClass('com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord')\n",
    "IDependencyParser = JClass('com.hankcs.hanlp.dependency.IDependencyParser')\n",
    "NeuralNetworkDependencyParser = JClass('com.hankcs.hanlp.dependency.nnparser.NeuralNetworkDependencyParser')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    parser = NeuralNetworkDependencyParser()\n",
    "    sentence = parser.parse(\"徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。\")\n",
    "    print(sentence)\n",
    "\n",
    "    # Visit each word through the sentence's iterator and print its arc as\n",
    "    # \"word --(relation)--> head\"; dir(sentence) lists the available methods.\n",
    "    for word in sentence.iterator():\n",
    "        print(\"%s --(%s)--> %s\" % (word.LEMMA, word.DEPREL, word.HEAD.LEMMA))\n",
    "    print()\n",
    "\n",
    "    # The words can also be fetched as an array and traversed in any order.\n",
    "    word_array = sentence.getWordArray()\n",
    "    for word in word_array:\n",
    "        print(\"%s --(%s)--> %s\" % (word.LEMMA, word.DEPREL, word.HEAD.LEMMA))\n",
    "    print()\n",
    "\n",
    "    # Follow the HEAD links upward from one word until a node has no HEAD.\n",
    "    # CoNLLWord is already bound at the top of this cell, so it is not looked\n",
    "    # up a second time here.\n",
    "    head = word_array[12]\n",
    "    while head.HEAD:\n",
    "        head = head.HEAD\n",
    "        if head == CoNLLWord.ROOT:\n",
    "            # Reached the virtual root: print only its lemma.\n",
    "            print(head.LEMMA)\n",
    "        else:\n",
    "            print(\"%s --(%s)--> \" % (head.LEMMA, head.DEPREL))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### tests/book/ch13/sigmoid.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Author: hankcs\n",
    "# Date: 2019-02-26 15:16\n",
    "# 《自然语言处理入门》13.2 深度学习与优势\n",
    "# 配套书籍：http://nlp.hankcs.com/book.php\n",
    "# 讨论答疑：https://bbs.hankcs.com/\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def sigmoid(x):\n",
    "    \"\"\"Return the logistic sigmoid 1 / (1 + exp(-x)) of x.\n",
    "\n",
    "    The exponential is computed with np.exp, so x may be a scalar or a\n",
    "    NumPy array (evaluated element-wise).\n",
    "    \"\"\"\n",
    "    return 1 / (1 + np.exp(-x))\n",
    "\n",
    "\n",
    "# 10000 evenly spaced sample points covering [-10, 10]\n",
    "x = np.linspace(-10, 10, 10000)\n",
    "\n",
    "plt.plot(x, sigmoid(x), 'b')  # 'b' selects a blue line\n",
    "plt.grid()\n",
    "\n",
    "plt.title(r'$\\sigma(x)=\\frac{1}{1+e^{-x}}$')\n",
    "plt.xlabel('x')\n",
    "plt.ylabel('y')\n",
    "# Write the figure to sigmoid.png in the current working directory, then display it\n",
    "plt.savefig('sigmoid.png')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}